From 198f2bc35e2253efb3dbc4e9945174c446917675 Mon Sep 17 00:00:00 2001
From: Arpit Agarwal
Date: Wed, 28 Mar 2018 11:37:34 -0700
Subject: [PATCH] HDFS-13314. NameNode should optionally exit if it detects
 FsImage corruption. Contributed by Arpit Agarwal.

---
 .../hadoop/hdfs/server/namenode/FSImage.java  | 29 ++++++++--
 .../namenode/FSImageFormatProtobuf.java       | 31 ++++++++---
 .../snapshot/FSImageFormatPBSnapshot.java     | 53 +++++++++++++++++-
 3 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
index e7581084d14..dd7df5ad6b4 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
@@ -35,6 +35,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicBoolean;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -68,6 +69,7 @@ import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
 import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.hdfs.util.MD5FileUtils;
 import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.util.ExitUtil;
 import org.apache.hadoop.util.Time;
 
 import com.google.common.base.Joiner;
@@ -86,6 +88,10 @@ public class FSImage implements Closeable {
   protected FSEditLog editLog = null;
   private boolean isUpgradeFinalized = false;
 
+  // If true, then image corruption was detected. The NameNode process will
+  // exit immediately after saving the image.
+  private AtomicBoolean exitAfterSave = new AtomicBoolean(false);
+
   protected NNStorage storage;
 
   /**
@@ -954,8 +960,14 @@
 
     FSImageFormatProtobuf.Saver saver = new FSImageFormatProtobuf.Saver(context);
     FSImageCompression compression = FSImageCompression.createCompression(conf);
-    saver.save(newFile, compression);
-
+    long numErrors = saver.save(newFile, compression);
+    if (numErrors > 0) {
+      // The image is likely corrupted.
+      LOG.error("Detected " + numErrors + " errors while saving FsImage " +
+          dstFile);
+      exitAfterSave.set(true);
+    }
+
     MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
     storage.setMostRecentCheckpointInfo(txid, Time.now());
   }
@@ -1117,6 +1129,12 @@
     }
     //Update NameDirSize Metric
     getStorage().updateNameDirSize();
+
+    if (exitAfterSave.get()) {
+      LOG.fatal("NameNode process will exit now... The saved FsImage " +
+          nnf + " is potentially corrupted.");
+      ExitUtil.terminate(-1);
+    }
   }
 
   /**
@@ -1184,8 +1202,11 @@
 
       // Since we now have a new checkpoint, we can clean up some
       // old edit logs and checkpoints.
-      purgeOldStorage(nnf);
-      archivalManager.purgeCheckpoints(NameNodeFile.IMAGE_NEW);
+      // Do not purge anything if we just wrote a corrupted FsImage.
+      if (!exitAfterSave.get()) {
+        purgeOldStorage(nnf);
+        archivalManager.purgeCheckpoints(NameNodeFile.IMAGE_NEW);
+      }
     } finally {
       // Notify any threads waiting on the checkpoint to be canceled
       // that it is complete.
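Reviewer note (illustrative, not part of the patch): the new exit path above
funnels through ExitUtil rather than a bare System.exit(), which keeps the
behavior observable from a unit test. A minimal sketch, assuming ExitUtil's
standard test hooks (disableSystemExit, terminateCalled); the class name and
the direct terminate() call are stand-ins for driving a real save through
FSImage:

    // Sketch only: once system exit is disabled, ExitUtil.terminate() throws
    // a catchable ExitException instead of killing the JVM.
    import org.apache.hadoop.util.ExitUtil;

    public class ExitOnCorruptionSketch {
      public static void main(String[] args) {
        ExitUtil.disableSystemExit();
        try {
          // Stand-in for the code under test; a real test would drive a
          // saveNamespace() over an image with a corrupt snapshot diff.
          ExitUtil.terminate(-1);
        } catch (ExitUtil.ExitException ee) {
          // Same observable outcome as the LOG.fatal + terminate above.
          System.out.println("terminate called, status=" + ee.status);
        }
        System.out.println("terminateCalled=" + ExitUtil.terminateCalled());
      }
    }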
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatProtobuf.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatProtobuf.java
index cd5a5526cc0..4ac20adc426 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatProtobuf.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormatProtobuf.java
@@ -444,15 +444,22 @@
       sectionOutputStream.flush();
     }
-    void save(File file, FSImageCompression compression) throws IOException {
+    /**
+     * @return number of non-fatal errors detected while writing the image.
+     * @throws IOException on fatal error.
+     */
+    long save(File file, FSImageCompression compression) throws IOException {
       FileOutputStream fout = new FileOutputStream(file);
       fileChannel = fout.getChannel();
       try {
         LOG.info("Saving image file {} using {}", file, compression);
         long startTime = monotonicNow();
-        saveInternal(fout, compression, file.getAbsolutePath());
-        LOG.info("Image file {} of size {} bytes saved in {} seconds.", file,
-            file.length(), (monotonicNow() - startTime) / 1000);
+        long numErrors = saveInternal(
+            fout, compression, file.getAbsolutePath());
+        LOG.info("Image file {} of size {} bytes saved in {} seconds {}.", file,
+            file.length(), (monotonicNow() - startTime) / 1000,
+            (numErrors > 0 ? (" with " + numErrors + " errors") : ""));
+        return numErrors;
       } finally {
         fout.close();
       }
     }
@@ -476,7 +483,11 @@
       saver.serializeFilesUCSection(sectionOutputStream);
     }
 
-    private void saveSnapshots(FileSummary.Builder summary) throws IOException {
+    /**
+     * @return number of non-fatal errors detected while saving the image.
+     * @throws IOException on fatal error.
+     */
+    private long saveSnapshots(FileSummary.Builder summary) throws IOException {
       FSImageFormatPBSnapshot.Saver snapshotSaver =
           new FSImageFormatPBSnapshot.Saver(
           this, summary, context, context.getSourceNamesystem());
@@ -487,9 +498,14 @@
         snapshotSaver.serializeSnapshotDiffSection(sectionOutputStream);
       }
       snapshotSaver.serializeINodeReferenceSection(sectionOutputStream);
+      return snapshotSaver.getNumImageErrors();
     }
 
-    private void saveInternal(FileOutputStream fout,
+    /**
+     * @return number of non-fatal errors detected while writing the FsImage.
+     * @throws IOException on fatal error.
+     */
+    private long saveInternal(FileOutputStream fout,
         FSImageCompression compression, String filePath) throws IOException {
       StartupProgress prog = NameNode.getStartupProgress();
       MessageDigest digester = MD5Hash.getDigester();
@@ -528,7 +544,7 @@
       step = new Step(StepType.INODES, filePath);
       prog.beginStep(Phase.SAVING_CHECKPOINT, step);
       saveInodes(b);
-      saveSnapshots(b);
+      long numErrors = saveSnapshots(b);
       prog.endStep(Phase.SAVING_CHECKPOINT, step);
 
       step = new Step(StepType.DELEGATION_TOKENS, filePath);
@@ -551,6 +567,7 @@
       saveFileSummary(underlyingOutputStream, summary);
       underlyingOutputStream.close();
       savedDigest = new MD5Hash(digester.digest());
+      return numErrors;
     }
 
     private void saveSecretManagerSection(FileSummary.Builder summary)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java
index bc7b0510505..0a073e97027 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/FSImageFormatPBSnapshot.java
@@ -27,6 +27,7 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -49,6 +50,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
 import org.apache.hadoop.hdfs.server.namenode.AclEntryStatusFormat;
 import org.apache.hadoop.hdfs.server.namenode.AclFeature;
 import org.apache.hadoop.hdfs.server.namenode.FSDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FSImage;
 import org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode;
 import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf;
 import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.LoaderContext;
@@ -415,6 +417,7 @@
     private final FileSummary.Builder headers;
     private final FSImageFormatProtobuf.Saver parent;
     private final SaveNamespaceContext context;
+    private long numImageErrors;
 
     public Saver(FSImageFormatProtobuf.Saver parent,
         FileSummary.Builder headers, SaveNamespaceContext context,
@@ -423,6 +426,7 @@
       this.headers = headers;
       this.context = context;
       this.fsn = fsn;
+      this.numImageErrors = 0;
     }
 
     /**
@@ -471,15 +475,17 @@
         throws IOException {
       final List<INodeReference> refList = parent.getSaverContext()
           .getRefList();
+      long i = 0;
       for (INodeReference ref : refList) {
-        INodeReferenceSection.INodeReference.Builder rb = buildINodeReference(ref);
+        INodeReferenceSection.INodeReference.Builder rb =
+            buildINodeReference(ref, i++);
         rb.build().writeDelimitedTo(out);
       }
       parent.commitSection(headers, SectionName.INODE_REFERENCE);
     }
 
     private INodeReferenceSection.INodeReference.Builder buildINodeReference(
-        INodeReference ref) throws IOException {
+        final INodeReference ref, final long refIndex) throws IOException {
       INodeReferenceSection.INodeReference.Builder rb =
           INodeReferenceSection.INodeReference.newBuilder().
               setReferredId(ref.getId());
@@ -489,6 +495,16 @@
       } else if (ref instanceof DstReference) {
         rb.setDstSnapshotId(ref.getDstSnapshotId());
       }
+
+      if (fsn.getFSDirectory().getInode(ref.getId()) == null) {
+        FSImage.LOG.error(
+            "FSImageFormatPBSnapshot: Missing referred INodeId " +
+            ref.getId() + " for INodeReference index " + refIndex +
+            "; path=" + ref.getFullPathName() +
+            "; parent=" + (ref.getParent() == null ? "null" :
+            ref.getParent().getFullPathName()));
+        ++numImageErrors;
+      }
       return rb;
     }
 
@@ -584,7 +600,23 @@
               .getList(ListType.CREATED);
           db.setCreatedListSize(created.size());
           List<INode> deleted = diff.getChildrenDiff().getList(ListType.DELETED);
+          INode previousNode = null;
+          boolean misordered = false;
           for (INode d : deleted) {
+            // getBytes() may return null below, and that is okay.
+            final int result = previousNode == null ? -1 :
+                previousNode.compareTo(d.getLocalNameBytes());
+            if (result == 0) {
+              FSImage.LOG.error(
+                  "Name '" + d.getLocalName() + "' is repeated in the " +
+                  "'deleted' difflist of directory " +
+                  dir.getFullPathName() + ", INodeId=" + dir.getId());
+              ++numImageErrors;
+            } else if (result > 0 && !misordered) {
+              misordered = true;
+              ++numImageErrors;
+            }
+            previousNode = d;
             if (d.isReference()) {
               refList.add(d.asReference());
               db.addDeletedINodeRef(refList.size() - 1);
@@ -592,11 +624,28 @@
              db.addDeletedINode(d.getId());
            }
          }
+          if (misordered) {
+            FSImage.LOG.error(
+                "Misordered entries in the 'deleted' difflist of directory " +
+                dir.getFullPathName() + ", INodeId=" + dir.getId() +
+                ". The full list is " +
+                Arrays.toString(deleted.toArray()));
+          }
           db.build().writeDelimitedTo(out);
           saveCreatedList(created, out);
         }
       }
     }
+
+
+    /**
+     * Number of non-fatal errors detected while writing the
+     * SnapshotDiff and INodeReference sections.
+     * @return the number of non-fatal errors detected.
+     */
+    public long getNumImageErrors() {
+      return numImageErrors;
+    }
   }
 
   private FSImageFormatPBSnapshot(){}
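Reviewer note (illustrative, not part of the patch): the duplicate/misorder
scan in serializeDirDiffList relies on the invariant that a directory diff's
'deleted' list is kept sorted by the inode's local name bytes, so one pass
comparing each entry to its predecessor suffices: a comparison result of 0
flags a repeated name, and a result > 0 flags an out-of-order list (counted
once per directory). A self-contained sketch of the same scan over plain byte
arrays; the comparator and sample names are illustrative, not the INode API:

    import java.nio.charset.StandardCharsets;
    import java.util.Arrays;
    import java.util.List;

    public class DeletedListScanSketch {
      // Byte-wise lexicographic comparison (unsigned here for illustration)
      // standing in for INode.compareTo(byte[]).
      static int compare(byte[] a, byte[] b) {
        int n = Math.min(a.length, b.length);
        for (int i = 0; i < n; i++) {
          int d = (a[i] & 0xff) - (b[i] & 0xff);
          if (d != 0) {
            return d;
          }
        }
        return a.length - b.length;
      }

      public static void main(String[] args) {
        // "c" before "b" is misordered; the second "c" is a duplicate.
        List<byte[]> deleted = Arrays.asList(
            "a".getBytes(StandardCharsets.UTF_8),
            "c".getBytes(StandardCharsets.UTF_8),
            "c".getBytes(StandardCharsets.UTF_8),
            "b".getBytes(StandardCharsets.UTF_8));
        byte[] previous = null;
        long numErrors = 0;
        boolean misordered = false;
        for (byte[] name : deleted) {
          int result = previous == null ? -1 : compare(previous, name);
          if (result == 0) {
            numErrors++;            // repeated name
          } else if (result > 0 && !misordered) {
            misordered = true;      // flagged once per list, as in the patch
            numErrors++;
          }
          previous = name;
        }
        // Prints: errors=2, misordered=true
        System.out.println("errors=" + numErrors + ", misordered=" + misordered);
      }
    }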