HDFS-5428. Under construction files deletion after snapshot+checkpoint+nn restart leads nn safemode. Contributed by Jing Zhao.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1543329 13f79535-47bb-0310-9956-ffa450edef68
Jing Zhao 2013-11-19 07:31:40 +00:00
parent 512475e56f
commit aa002344d0
6 changed files with 229 additions and 22 deletions
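The scenario this patch addresses, condensed from the tests added to TestOpenFilesWithSnapshot below (a sketch only: it reuses that test's fs/cluster fixture, and the file and snapshot names here are illustrative, not taken from the patch):

    // A file is open for write, captured by a snapshot, then deleted from the
    // current namespace, so it survives only inside the snapshot.
    Path dir = new Path("/test");
    fs.mkdirs(dir);
    fs.allowSnapshot(dir);
    FSDataOutputStream out = fs.create(new Path(dir, "open-file"));
    out.write(new byte[1024]);
    out.hflush();                                  // keep the file under construction
    fs.createSnapshot(dir, "s1");                  // snapshot captures the open file
    fs.delete(new Path(dir, "open-file"), true);   // now it exists only in the snapshot
    // Checkpoint and restart: before this fix the restarted NameNode could get
    // stuck in safemode while loading the resulting fsimage.
    NameNode nn = cluster.getNameNode();
    NameNodeAdapter.enterSafeMode(nn, false);
    NameNodeAdapter.saveNamespace(nn);
    NameNodeAdapter.leaveSafeMode(nn);
    cluster.restartNameNode(true);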

CHANGES.txt

@@ -576,6 +576,9 @@ Release 2.3.0 - UNRELEASED
     HDFS-5502. Fix HTTPS support in HsftpFileSystem. (Haohui Mai via jing9)
+
+    HDFS-5428. Under construction files deletion after snapshot+checkpoint+nn restart
+    leads nn safemode. (jing9)
 Release 2.2.1 - UNRELEASED
 
   INCOMPATIBLE CHANGES

FSImageFormat.java

@@ -32,6 +32,7 @@ import java.security.DigestOutputStream;
 import java.security.MessageDigest;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -48,6 +49,7 @@ import org.apache.hadoop.hdfs.protocol.HdfsConstants;
 import org.apache.hadoop.hdfs.protocol.LayoutVersion;
 import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
 import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
 import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
 import org.apache.hadoop.hdfs.server.namenode.snapshot.FileWithSnapshot.FileDiffList;
@@ -678,6 +680,12 @@ public class FSImageFormat {
       if (underConstruction) {
         clientName = FSImageSerialization.readString(in);
         clientMachine = FSImageSerialization.readString(in);
+        // convert the last block to BlockUC
+        if (blocks != null && blocks.length > 0) {
+          BlockInfo lastBlk = blocks[blocks.length - 1];
+          blocks[blocks.length - 1] = new BlockInfoUnderConstruction(
+              lastBlk, replication);
+        }
       }
     }
   }
@@ -690,10 +698,15 @@
       }
       final INodeFile file = new INodeFile(inodeId, localName, permissions,
           modificationTime, atime, blocks, replication, blockSize);
-      return fileDiffs != null? new INodeFileWithSnapshot(file, fileDiffs)
-          : underConstruction? new INodeFileUnderConstruction(
-              file, clientName, clientMachine, null)
-          : file;
+      if (underConstruction) {
+        INodeFileUnderConstruction fileUC = new INodeFileUnderConstruction(
+            file, clientName, clientMachine, null);
+        return fileDiffs == null ? fileUC :
+            new INodeFileUnderConstructionWithSnapshot(fileUC, fileDiffs);
+      } else {
+        return fileDiffs == null ? file :
+            new INodeFileWithSnapshot(file, fileDiffs);
+      }
     } else if (numBlocks == -1) {
       //directory
@@ -821,8 +834,20 @@
       // verify that file exists in namespace
       String path = cons.getLocalName();
-      final INodesInPath iip = fsDir.getLastINodeInPath(path);
-      INodeFile oldnode = INodeFile.valueOf(iip.getINode(0), path);
+      INodeFile oldnode = null;
+      boolean inSnapshot = false;
+      if (path != null && FSDirectory.isReservedName(path) &&
+          LayoutVersion.supports(Feature.ADD_INODE_ID, getLayoutVersion())) {
+        // TODO: for HDFS-5428, we use reserved path for those INodeFileUC in
+        // snapshot. If we support INode ID in the layout version, we can use
+        // the inode id to find the oldnode.
+        oldnode = namesystem.dir.getInode(cons.getId()).asFile();
+        inSnapshot = true;
+      } else {
+        final INodesInPath iip = fsDir.getLastINodeInPath(path);
+        oldnode = INodeFile.valueOf(iip.getINode(0), path);
+      }
       cons.setLocalName(oldnode.getLocalNameBytes());
       INodeReference parentRef = oldnode.getParentReference();
       if (parentRef != null) {
@@ -833,11 +858,23 @@
       if (oldnode instanceof INodeFileWithSnapshot) {
         cons = new INodeFileUnderConstructionWithSnapshot(cons,
             ((INodeFileWithSnapshot) oldnode).getDiffs());
       }
+      if (!inSnapshot) {
         fsDir.replaceINodeFile(path, oldnode, cons);
         namesystem.leaseManager.addLease(cons.getClientName(), path);
+      } else {
+        if (parentRef != null) {
+          // replace oldnode with cons
+          parentRef.setReferredINode(cons);
+        } else {
+          // replace old node in its parent's children list and deleted list
+          oldnode.getParent().replaceChildFileInSnapshot(oldnode, cons);
+          namesystem.dir.addToInodeMap(cons);
+          updateBlocksMap(cons);
+        }
+      }
     }
   }
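On the load side the reserved path is only a marker: the inode is resolved through the id serialized with the under-construction record (namesystem.dir.getInode(cons.getId())), and no lease is added for it. For illustration, the entry written for a snapshot-only inode with the made-up id 16392 would carry a path like the one built below (the literal values of the FSDirectory constants are assumed here, not quoted from the patch):

    long inodeId = 16392;                                   // hypothetical inode id
    String marker = FSDirectory.DOT_RESERVED_PATH_PREFIX    // assumed "/.reserved"
        + Path.SEPARATOR + FSDirectory.DOT_INODES_STRING    // assumed ".inodes"
        + Path.SEPARATOR + inodeId;                         // "/.reserved/.inodes/16392"
    // FSDirectory.isReservedName(marker) returns true, which is what routes the
    // loader into the lookup-by-id branch above instead of path lookup plus lease.
    assert FSDirectory.isReservedName(marker);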
@@ -917,6 +954,9 @@ public class FSImageFormat {
     private MD5Hash savedDigest;
     private final ReferenceMap referenceMap = new ReferenceMap();
+    private final Map<Long, INodeFileUnderConstruction> snapshotUCMap =
+        new HashMap<Long, INodeFileUnderConstruction>();
+
     /** @throws IllegalStateException if the instance has not yet saved an image */
     private void checkSaved() {
       if (!saved) {
@@ -992,14 +1032,22 @@
       // save the root
       saveINode2Image(fsDir.rootDir, out, false, referenceMap, counter);
       // save the rest of the nodes
-      saveImage(fsDir.rootDir, out, true, counter);
+      saveImage(fsDir.rootDir, out, true, false, counter);
       prog.endStep(Phase.SAVING_CHECKPOINT, step);
       // Now that the step is finished, set counter equal to total to adjust
       // for possible under-counting due to reference inodes.
       prog.setCount(Phase.SAVING_CHECKPOINT, step,
           fsDir.rootDir.numItemsInTree());
       // save files under construction
-      sourceNamesystem.saveFilesUnderConstruction(out);
+      // TODO: for HDFS-5428, since we cannot break the compatibility of
+      // fsimage, we store part of the under-construction files that are only
+      // in snapshots in this "under-construction-file" section. As a
+      // temporary solution, we use "/.reserved/.inodes/<inodeid>" as their
+      // paths, so that when loading fsimage we do not put them into the lease
+      // map. In the future, we can remove this hack when we can bump the
+      // layout version.
+      sourceNamesystem.saveFilesUnderConstruction(out, snapshotUCMap);
       context.checkCancelled();
       sourceNamesystem.saveSecretManagerState(out, sdPath);
       context.checkCancelled();
@@ -1024,20 +1072,31 @@
      * Save children INodes.
      * @param children The list of children INodes
      * @param out The DataOutputStream to write
+     * @param inSnapshot Whether the parent directory or its ancestor is in
+     *                   the deleted list of some snapshot (caused by rename or
+     *                   deletion)
      * @param counter Counter to increment for namenode startup progress
      * @return Number of children that are directory
      */
-    private int saveChildren(ReadOnlyList<INode> children, DataOutputStream out,
-        Counter counter) throws IOException {
+    private int saveChildren(ReadOnlyList<INode> children,
+        DataOutputStream out, boolean inSnapshot, Counter counter)
+        throws IOException {
       // Write normal children INode.
       out.writeInt(children.size());
       int dirNum = 0;
       int i = 0;
       for(INode child : children) {
         // print all children first
+        // TODO: for HDFS-5428, we cannot change the format/content of fsimage
+        // here, thus even if the parent directory is in snapshot, we still
+        // do not handle INodeUC as those stored in deleted list
         saveINode2Image(child, out, false, referenceMap, counter);
         if (child.isDirectory()) {
           dirNum++;
+        } else if (inSnapshot && child.isFile()
+            && child.asFile().isUnderConstruction()) {
+          this.snapshotUCMap.put(child.getId(),
+              (INodeFileUnderConstruction) child.asFile());
         }
         if (i++ % 50 == 0) {
           context.checkCancelled();
@@ -1054,14 +1113,15 @@
      *
      * @param current The current node
      * @param out The DataoutputStream to write the image
-     * @param snapshot The possible snapshot associated with the current node
      * @param toSaveSubtree Whether or not to save the subtree to fsimage. For
      *                      reference node, its subtree may already have been
      *                      saved before.
+     * @param inSnapshot Whether the current directory is in snapshot
      * @param counter Counter to increment for namenode startup progress
      */
     private void saveImage(INodeDirectory current, DataOutputStream out,
-        boolean toSaveSubtree, Counter counter) throws IOException {
+        boolean toSaveSubtree, boolean inSnapshot, Counter counter)
+        throws IOException {
       // write the inode id of the directory
       out.writeLong(current.getId());
@@ -1090,7 +1150,7 @@
       }
       // 3. Write children INode
-      dirNum += saveChildren(children, out, counter);
+      dirNum += saveChildren(children, out, inSnapshot, counter);
       // 4. Write DirectoryDiff lists, if there is any.
       SnapshotFSImageFormat.saveDirectoryDiffList(current, out, referenceMap);
@@ -1105,14 +1165,14 @@
         // make sure we only save the subtree under a reference node once
         boolean toSave = child.isReference() ?
             referenceMap.toProcessSubtree(child.getId()) : true;
-        saveImage(child.asDirectory(), out, toSave, counter);
+        saveImage(child.asDirectory(), out, toSave, inSnapshot, counter);
       }
       if (snapshotDirs != null) {
         for (INodeDirectory subDir : snapshotDirs) {
           // make sure we only save the subtree under a reference node once
           boolean toSave = subDir.getParentReference() != null ?
               referenceMap.toProcessSubtree(subDir.getId()) : true;
-          saveImage(subDir, out, toSave, counter);
+          saveImage(subDir, out, toSave, true, counter);
         }
       }
     }

FSNamesystem.java

@@ -5929,19 +5929,40 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
   /**
    * Serializes leases.
    */
-  void saveFilesUnderConstruction(DataOutputStream out) throws IOException {
+  void saveFilesUnderConstruction(DataOutputStream out,
+      Map<Long, INodeFileUnderConstruction> snapshotUCMap) throws IOException {
     // This is run by an inferior thread of saveNamespace, which holds a read
     // lock on our behalf. If we took the read lock here, we could block
     // for fairness if a writer is waiting on the lock.
     synchronized (leaseManager) {
       Map<String, INodeFileUnderConstruction> nodes =
           leaseManager.getINodesUnderConstruction();
-      out.writeInt(nodes.size()); // write the size
+      for (Map.Entry<String, INodeFileUnderConstruction> entry
+          : nodes.entrySet()) {
+        // TODO: for HDFS-5428, because of rename operations, some
+        // under-construction files that are
+        // in the current fs directory can also be captured in the
+        // snapshotUCMap. We should remove them from the snapshotUCMap.
+        snapshotUCMap.remove(entry.getValue().getId());
+      }
+      out.writeInt(nodes.size() + snapshotUCMap.size()); // write the size
       for (Map.Entry<String, INodeFileUnderConstruction> entry
           : nodes.entrySet()) {
         FSImageSerialization.writeINodeUnderConstruction(
             out, entry.getValue(), entry.getKey());
       }
+      for (Map.Entry<Long, INodeFileUnderConstruction> entry
+          : snapshotUCMap.entrySet()) {
+        // for those snapshot INodeFileUC, we use "/.reserved/.inodes/<inodeid>"
+        // as their paths
+        StringBuilder b = new StringBuilder();
+        b.append(FSDirectory.DOT_RESERVED_PATH_PREFIX)
+            .append(Path.SEPARATOR).append(FSDirectory.DOT_INODES_STRING)
+            .append(Path.SEPARATOR).append(entry.getValue().getId());
+        FSImageSerialization.writeINodeUnderConstruction(
+            out, entry.getValue(), b.toString());
+      }
     }
   }
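The count written at the start of this section has to account for files that appear both under a lease (still open in the current namespace, for example after a rename) and in the snapshot map, which is why leased inode ids are removed from snapshotUCMap before the size is written. A self-contained plain-Java illustration of that accounting (stand-in maps with made-up paths and ids, not HDFS types):

    import java.util.HashMap;
    import java.util.Map;

    public class UcSectionCountSketch {
      public static void main(String[] args) {
        // Files still held by a lease in the current namespace: path -> inode id.
        Map<String, Long> leasedUC = new HashMap<String, Long>();
        leasedUC.put("/test/test/test2", 16388L);
        leasedUC.put("/test/renamed-open-file", 16390L); // renamed while open

        // Under-construction files found while walking snapshot diffs: id -> note.
        Map<Long, String> snapshotUC = new HashMap<Long, String>();
        snapshotUC.put(16390L, "captured in snapshot, then renamed");
        snapshotUC.put(16401L, "captured in snapshot, then deleted");

        // A renamed-but-still-leased file shows up in both collections; drop it
        // from the snapshot map so it is serialized exactly once.
        for (Long id : leasedUC.values()) {
          snapshotUC.remove(id);
        }

        // This total is what the section header must announce to the loader.
        int sectionSize = leasedUC.size() + snapshotUC.size();
        System.out.println("under-construction section size = " + sectionSize); // 3
      }
    }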

INodeDirectory.java

@@ -205,6 +205,23 @@ public class INodeDirectory extends INodeWithAdditionalFields
     return newDir;
   }
+
+  /**
+   * Used when load fileUC from fsimage. The file to be replaced is actually
+   * only in snapshot, thus may not be contained in the children list.
+   * See HDFS-5428 for details.
+   */
+  public void replaceChildFileInSnapshot(INodeFile oldChild,
+      final INodeFile newChild) {
+    if (children != null) {
+      final int i = searchChildren(newChild.getLocalNameBytes());
+      if (i >= 0 && children.get(i).getId() == oldChild.getId()) {
+        // no need to consider reference node here, since we already do the
+        // replacement in FSImageFormat.Loader#loadFilesUnderConstruction
+        children.set(i, newChild);
+      }
+    }
+  }
   /** Replace the given child with a new child. */
   public void replaceChild(INode oldChild, final INode newChild,
       final INodeMap inodeMap) {

INodeDirectoryWithSnapshot.java

@@ -38,6 +38,7 @@ import org.apache.hadoop.hdfs.server.namenode.INode;
 import org.apache.hadoop.hdfs.server.namenode.INodeDirectory;
 import org.apache.hadoop.hdfs.server.namenode.INodeDirectoryAttributes;
 import org.apache.hadoop.hdfs.server.namenode.INodeDirectoryWithQuota;
+import org.apache.hadoop.hdfs.server.namenode.INodeFile;
 import org.apache.hadoop.hdfs.server.namenode.INodeMap;
 import org.apache.hadoop.hdfs.server.namenode.INodeReference;
 import org.apache.hadoop.hdfs.server.namenode.Quota;
@@ -75,7 +76,7 @@ public class INodeDirectoryWithSnapshot extends INodeDirectoryWithQuota
         final INode oldChild, final INode newChild) {
       final List<INode> list = getList(type);
       final int i = search(list, oldChild.getLocalNameBytes());
-      if (i < 0) {
+      if (i < 0 || list.get(i).getId() != oldChild.getId()) {
         return false;
       }
@@ -592,6 +593,14 @@
     return removed;
   }
 
+  @Override
+  public void replaceChildFileInSnapshot(final INodeFile oldChild,
+      final INodeFile newChild) {
+    super.replaceChildFileInSnapshot(oldChild, newChild);
+    diffs.replaceChild(ListType.DELETED, oldChild, newChild);
+    diffs.replaceChild(ListType.CREATED, oldChild, newChild);
+  }
+
   @Override
   public void replaceChild(final INode oldChild, final INode newChild,
       final INodeMap inodeMap) {
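The extra getId() comparison added above guards against name collisions: after a delete (or rename) and re-create, two different inodes can carry the same local name, and replacing by name alone would touch the wrong entry. A plain-Java sketch of that rule (stand-in Node class and made-up ids, not HDFS code):

    import java.util.ArrayList;
    import java.util.List;

    public class ReplaceByIdSketch {
      static class Node {
        final long id; final String name;
        Node(long id, String name) { this.id = id; this.name = name; }
      }

      /** Replace oldChild only if the entry found by name is the same inode. */
      static boolean replace(List<Node> list, Node oldChild, Node newChild) {
        for (int i = 0; i < list.size(); i++) {
          if (list.get(i).name.equals(oldChild.name)) {
            if (list.get(i).id != oldChild.id) {
              return false;        // same name, different inode: leave it alone
            }
            list.set(i, newChild);
            return true;
          }
        }
        return false;
      }

      public static void main(String[] args) {
        List<Node> deletedList = new ArrayList<Node>();
        deletedList.add(new Node(16388, "test2"));   // inode captured in a snapshot
        Node recreated = new Node(16395, "test2");   // later file, same local name
        // The recreated inode must not replace the snapshot's entry.
        System.out.println(replace(deletedList, recreated, new Node(16399, "test2"))); // false
      }
    }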

TestOpenFilesWithSnapshot.java

@@ -28,6 +28,11 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.client.HdfsDataOutputStream.SyncFlag;
+import org.apache.hadoop.hdfs.server.namenode.INodeId;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
+import org.apache.hadoop.security.AccessControlException;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -76,6 +81,47 @@ public class TestOpenFilesWithSnapshot {
     cluster.restartNameNode();
   }
+
+  @Test
+  public void testWithCheckpoint() throws Exception {
+    Path path = new Path("/test");
+    doWriteAndAbort(fs, path);
+    fs.delete(new Path("/test/test"), true);
+    NameNode nameNode = cluster.getNameNode();
+    NameNodeAdapter.enterSafeMode(nameNode, false);
+    NameNodeAdapter.saveNamespace(nameNode);
+    NameNodeAdapter.leaveSafeMode(nameNode);
+    cluster.restartNameNode(true);
+    // read snapshot file after restart
+    String test2snapshotPath = Snapshot.getSnapshotPath(path.toString(),
+        "s1/test/test2");
+    DFSTestUtil.readFile(fs, new Path(test2snapshotPath));
+    String test3snapshotPath = Snapshot.getSnapshotPath(path.toString(),
+        "s1/test/test3");
+    DFSTestUtil.readFile(fs, new Path(test3snapshotPath));
+  }
+
+  @Test
+  public void testFilesDeletionWithCheckpoint() throws Exception {
+    Path path = new Path("/test");
+    doWriteAndAbort(fs, path);
+    fs.delete(new Path("/test/test/test2"), true);
+    fs.delete(new Path("/test/test/test3"), true);
+    NameNode nameNode = cluster.getNameNode();
+    NameNodeAdapter.enterSafeMode(nameNode, false);
+    NameNodeAdapter.saveNamespace(nameNode);
+    NameNodeAdapter.leaveSafeMode(nameNode);
+    cluster.restartNameNode(true);
+    // read snapshot file after restart
+    String test2snapshotPath = Snapshot.getSnapshotPath(path.toString(),
+        "s1/test/test2");
+    DFSTestUtil.readFile(fs, new Path(test2snapshotPath));
+    String test3snapshotPath = Snapshot.getSnapshotPath(path.toString(),
+        "s1/test/test3");
+    DFSTestUtil.readFile(fs, new Path(test3snapshotPath));
+  }
+
   private void doWriteAndAbort(DistributedFileSystem fs, Path path)
       throws IOException {
     fs.mkdirs(path);
@@ -110,4 +156,55 @@
     DFSTestUtil.abortStream((DFSOutputStream) out2.getWrappedStream());
     fs.createSnapshot(path, "s1");
   }
+
+  @Test
+  public void testOpenFilesWithMultipleSnapshots() throws Exception {
+    doTestMultipleSnapshots(true);
+  }
+
+  @Test
+  public void testOpenFilesWithMultipleSnapshotsWithoutCheckpoint()
+      throws Exception {
+    doTestMultipleSnapshots(false);
+  }
+
+  private void doTestMultipleSnapshots(boolean saveNamespace)
+      throws IOException, AccessControlException {
+    Path path = new Path("/test");
+    doWriteAndAbort(fs, path);
+    fs.createSnapshot(path, "s2");
+    fs.delete(new Path("/test/test"), true);
+    fs.deleteSnapshot(path, "s2");
+    if (saveNamespace) {
+      NameNode nameNode = cluster.getNameNode();
+      NameNodeAdapter.enterSafeMode(nameNode, false);
+      NameNodeAdapter.saveNamespace(nameNode);
+      NameNodeAdapter.leaveSafeMode(nameNode);
+    }
+    cluster.restartNameNode(true);
+  }
+
+  @Test
+  public void testOpenFilesWithRename() throws Exception {
+    Path path = new Path("/test");
+    doWriteAndAbort(fs, path);
+    // check for zero sized blocks
+    Path fileWithEmptyBlock = new Path("/test/test/test4");
+    fs.create(fileWithEmptyBlock);
+    NamenodeProtocols nameNodeRpc = cluster.getNameNodeRpc();
+    String clientName = fs.getClient().getClientName();
+    // create one empty block
+    nameNodeRpc.addBlock(fileWithEmptyBlock.toString(), clientName, null, null,
+        INodeId.GRANDFATHER_INODE_ID, null);
+    fs.createSnapshot(path, "s2");
+    fs.rename(new Path("/test/test"), new Path("/test/test-renamed"));
+    fs.delete(new Path("/test/test-renamed"), true);
+    NameNode nameNode = cluster.getNameNode();
+    NameNodeAdapter.enterSafeMode(nameNode, false);
+    NameNodeAdapter.saveNamespace(nameNode);
+    NameNodeAdapter.leaveSafeMode(nameNode);
+    cluster.restartNameNode(true);
+  }
 }