diff --git a/hdfs/CHANGES.txt b/hdfs/CHANGES.txt index 2f1d6e6b03e..93ff2e38578 100644 --- a/hdfs/CHANGES.txt +++ b/hdfs/CHANGES.txt @@ -11,6 +11,10 @@ Trunk (unreleased changes) HDFS-2210. Remove hdfsproxy. (eli) + HDFS-1073. Redesign the NameNode's storage layout for image checkpoints + and edit logs to introduce transaction IDs and be more robust. + Please see HDFS-1073 section below for breakout of individual patches. + NEW FEATURES HDFS-1359. Add BlockPoolID to Block. (suresh) @@ -904,6 +908,57 @@ Trunk (unreleased changes) HDFS-1776. Bug in Concat code. (Bharath Mundlapudi via Dmytro Molkov) + BREAKDOWN OF HDFS-1073 SUBTASKS + + HDFS-1521. Persist transaction ID on disk between NN restarts. + (Ivan Kelly and Todd Lipcon via todd) + HDFS-1538. Refactor more startup and image loading code out of FSImage. + (todd) + HDFS-1729. Add code to detect valid length of an edits file. (todd) + HDFS-1793. Add code to inspect a storage directory with txid-based + filenames (todd) + HDFS-1794. Add code to list which edit logs are available on a remote NN + (todd) + HDFS-1858. Add state management variables to FSEditLog (Ivan Kelly and Todd + Lipcon via todd) + HDFS-1859. Add some convenience functions to iterate over edit log streams + (Ivan Kelly and Todd Lipcon via todd) + HDFS-1894. Add constants for LAYOUT_VERSIONs in edits log branch (todd) + HDFS-1892. Fix EditLogFileInputStream.getValidLength to be aware of + OP_INVALID filler (todd) + HDFS-1799. Refactor log rolling and filename management out of FSEditLog + (Ivan Kelly and Todd Lipcon via todd) + HDFS-1801. Remove use of timestamps to identify checkpoints and logs (todd) + HDFS-1930. TestDFSUpgrade failing in HDFS-1073 branch (todd) + HDFS-1800. Extend image checksumming to function with multiple fsimage + files per directory. (todd) + HDFS-1725. Set storage directories only at FSImage construction (Ivan Kelly + via todd) + HDFS-1926. Remove references to StorageDirectory from JournalManager + interface (Ivan Kelly via todd) + HDFS-1893. Change edit logs and images to be named based on txid (todd) + HDFS-1985. Clean up image transfer servlet (todd) + HDFS-1984. Enable multiple secondary namenodes to run simultaneously (todd) + HDFS-1987. Re-enable TestCheckpoint.testSecondaryImageDownload which was + not running previously. (todd) + HDFS-1993. TestCheckpoint needs to clean up between cases (todd) + HDFS-1992. Remove vestiges of NNStorageListener. (todd) + HDFS-1991. Some refactoring of Secondary NameNode to be able to share more + code with the BackupNode or CheckpointNode. (todd) + HDFS-1994. Fix race conditions when running two rapidly checkpointing + Secondary NameNodes. (todd) + HDFS-2001. Remove use of previous.checkpoint and lastcheckpoint.tmp + directories (todd) + HDFS-2015. Remove checkpointTxId from VERSION file. (todd) + HDFS-2016. Add infrastructure to remove or archive old and unneeded storage + files within the name directories. (todd) + HDFS-2047. Improve TestNamespace and TestEditLog in HDFS-1073 branch. + (todd) + HDFS-2048. Add upgrade tests and fix upgrade from 0.22 with corrupt image. + (todd) + HDFS-2027. Image inspector should return finalized logs before unfinalized + logs. 
(todd) + Release 0.22.0 - Unreleased INCOMPATIBLE CHANGES diff --git a/hdfs/bin/hdfs b/hdfs/bin/hdfs index b87deda480d..0d41b21705f 100755 --- a/hdfs/bin/hdfs +++ b/hdfs/bin/hdfs @@ -106,9 +106,7 @@ elif [ "$COMMAND" = "getconf" ] ; then elif [ "$COMMAND" = "groups" ] ; then CLASS=org.apache.hadoop.hdfs.tools.GetGroups else - echo $COMMAND - invalid command - print_usage - exit + CLASS="$COMMAND" fi # for developers, add hdfs classes to CLASSPATH diff --git a/hdfs/ivy.xml b/hdfs/ivy.xml index d52b8d58be2..ecaf026d3ea 100644 --- a/hdfs/ivy.xml +++ b/hdfs/ivy.xml @@ -67,6 +67,7 @@ + diff --git a/hdfs/ivy/libraries.properties b/hdfs/ivy/libraries.properties index 00e8331a2fe..8fc14d9d2da 100644 --- a/hdfs/ivy/libraries.properties +++ b/hdfs/ivy/libraries.properties @@ -34,6 +34,8 @@ commons-net.version=1.4.1 core.version=3.1.1 coreplugin.version=1.3.2 +guava.version=r09 + hadoop-common.version=0.23.0-SNAPSHOT hadoop-hdfs.version=0.23.0-SNAPSHOT diff --git a/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml b/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml index 7193747055b..2bfa2e0bbdd 100644 --- a/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml +++ b/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml @@ -271,9 +271,9 @@ the maximum delay between two consecutive checkpoints, and
  • - dfs.namenode.checkpoint.size, set to 64MB by default, defines the - size of the edits log file that forces an urgent checkpoint even if - the maximum checkpoint delay is not reached. + dfs.namenode.checkpoint.txns, set to 40000 by default, defines the + number of uncheckpointed transactions on the NameNode which will force + an urgent checkpoint, even if the checkpoint period has not been reached.
  • @@ -322,9 +322,9 @@ the maximum delay between two consecutive checkpoints

  • - dfs.namenode.checkpoint.size, set to 64MB by default, defines the - size of the edits log file that forces an urgent checkpoint even if - the maximum checkpoint delay is not reached. + dfs.namenode.checkpoint.txns, set to 40000 by default, defines the + number of uncheckpointed transactions on the NameNode which will force + an urgent checkpoint, even if the checkpoint period has not been reached.
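The two user-guide entries above, together with the hdfs-default.xml and DFSConfigKeys changes that follow, replace the old size-based checkpoint trigger (dfs.namenode.checkpoint.size) with a transaction-count trigger. As a minimal sketch, not part of the patch itself, a checkpointing daemon could read the new keys as below; only the constants added to DFSConfigKeys by this patch are assumed, and the class name CheckpointTriggerExample is illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

// Hypothetical example class; mirrors how the new checkpoint triggers are read.
public class CheckpointTriggerExample {
  public static void main(String[] args) {
    Configuration conf = new HdfsConfiguration();

    // Checkpoint once this many transactions have accumulated
    // (replaces the old dfs.namenode.checkpoint.size byte threshold).
    long checkpointTxns = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY,        // "dfs.namenode.checkpoint.txns"
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);   // 40000

    // Checkpoint at least this often (seconds), regardless of transaction count.
    long checkpointPeriod = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,      // "dfs.namenode.checkpoint.period"
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); // 3600

    // How often (seconds) to poll the NameNode for its transaction count.
    long checkPeriod = conf.getLong(
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY,      // "dfs.namenode.checkpoint.check.period"
        DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT); // 60

    System.out.println("checkpoint every " + checkpointTxns + " txns or "
        + checkpointPeriod + " s, polling every " + checkPeriod + " s");
  }
}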
  • diff --git a/hdfs/src/java/hdfs-default.xml b/hdfs/src/java/hdfs-default.xml index 7557d74eed7..08931de4c3f 100644 --- a/hdfs/src/java/hdfs-default.xml +++ b/hdfs/src/java/hdfs-default.xml @@ -582,10 +582,30 @@ creations/deletions), or "all". - dfs.namenode.checkpoint.size - 67108864 - The size of the current edit log (in bytes) that triggers - a periodic checkpoint even if the dfs.namenode.checkpoint.period hasn't expired. + dfs.namenode.checkpoint.txns + 40000 + The Secondary NameNode or CheckpointNode will create a checkpoint + of the namespace every 'dfs.namenode.checkpoint.txns' transactions, regardless + of whether 'dfs.namenode.checkpoint.period' has expired. + + + + + dfs.namenode.checkpoint.check.period + 60 + The SecondaryNameNode and CheckpointNode will poll the NameNode + every 'dfs.namenode.checkpoint.check.period' seconds to query the number + of uncheckpointed transactions. + + + + + dfs.namenode.num.checkpoints.retained + 2 + The number of image checkpoint files that will be retained by + the NameNode and Secondary NameNode in their storage directories. All edit + logs necessary to recover an up-to-date namespace from the oldest retained + checkpoint will also be retained. diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java index 94fbd9a00b3..b9b3a60e0f4 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java @@ -73,10 +73,12 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final int DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT = 0; public static final String DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY = "dfs.namenode.secondary.http-address"; public static final String DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_DEFAULT = "0.0.0.0:50090"; + public static final String DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY = "dfs.namenode.checkpoint.check.period"; + public static final long DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT = 60; public static final String DFS_NAMENODE_CHECKPOINT_PERIOD_KEY = "dfs.namenode.checkpoint.period"; public static final long DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT = 3600; - public static final String DFS_NAMENODE_CHECKPOINT_SIZE_KEY = "dfs.namenode.checkpoint.size"; - public static final long DFS_NAMENODE_CHECKPOINT_SIZE_DEFAULT = 4194304; + public static final String DFS_NAMENODE_CHECKPOINT_TXNS_KEY = "dfs.namenode.checkpoint.txns"; + public static final long DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT = 40000; public static final String DFS_NAMENODE_UPGRADE_PERMISSION_KEY = "dfs.namenode.upgrade.permission"; public static final int DFS_NAMENODE_UPGRADE_PERMISSION_DEFAULT = 00777; public static final String DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY = "dfs.namenode.heartbeat.recheck-interval"; @@ -110,6 +112,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys { public static final boolean DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT = false; public static final String DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY = "dfs.namenode.support.allow.format"; public static final boolean DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT = true; + public static final String DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY = "dfs.namenode.num.checkpoints.retained"; + public static final int DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT = 2; + public static final String DFS_LIST_LIMIT = "dfs.ls.limit"; public static final int DFS_LIST_LIMIT_DEFAULT = 1000; public static final String 
DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY = "dfs.datanode.failed.volumes.tolerated"; diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java b/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java index 33d053561a4..294e5852cbd 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java @@ -85,7 +85,6 @@ private static void addDeprecatedKeys() { deprecate("fs.checkpoint.dir", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY); deprecate("fs.checkpoint.edits.dir", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY); deprecate("fs.checkpoint.period", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY); - deprecate("fs.checkpoint.size", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_SIZE_KEY); deprecate("dfs.upgrade.permission", DFSConfigKeys.DFS_NAMENODE_UPGRADE_PERMISSION_KEY); deprecate("heartbeat.recheck.interval", DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY); deprecate("StorageId", DFSConfigKeys.DFS_DATANODE_STORAGEID_KEY); diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java index dc516ae3d94..5e803f2282b 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java @@ -65,6 +65,9 @@ public enum SafeModeAction{ SAFEMODE_LEAVE, SAFEMODE_ENTER, SAFEMODE_GET; } // type of the datanode report public static enum DatanodeReportType {ALL, LIVE, DEAD } + + // An invalid transaction ID that will never be seen in a real namesystem. + public static final long INVALID_TXID = -12345; /** * Distributed upgrade actions: diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java index 0becfd827d2..a3d7ca7fad8 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java @@ -78,7 +78,9 @@ public static enum Feature { RESERVED_REL22(-33, -27, "Reserved for release 0.22"), RESERVED_REL23(-34, -30, "Reserved for release 0.23"), FEDERATION(-35, "Support for namenode federation"), - LEASE_REASSIGNMENT(-36, "Support for persisting lease holder reassignment"); + LEASE_REASSIGNMENT(-36, "Support for persisting lease holder reassignment"), + STORED_TXIDS(-37, "Transaction IDs are stored in edits log and image files"), + TXID_BASED_LAYOUT(-38, "File names in NN Storage are based on transaction IDs"); final int lv; final int ancestorLV; diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java index 3d0a4194566..19ad35bb9a7 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java @@ -181,6 +181,16 @@ public Iterator dirIterator(StorageDirType dirType) { return new DirIterator(dirType); } + public Iterable dirIterable(final StorageDirType dirType) { + return new Iterable() { + @Override + public Iterator iterator() { + return dirIterator(dirType); + } + }; + } + + /** * generate storage list (debug line) */ @@ -568,13 +578,17 @@ public void lock() throws IOException { LOG.info("Locking is disabled"); return; } - this.lock = tryLock(); - if (lock == null) { + FileLock newLock = tryLock(); + if (newLock == null) { String msg = "Cannot lock storage " + this.root + ". 
The directory is already locked."; LOG.info(msg); throw new IOException(msg); } + // Don't overwrite lock until success - this way if we accidentally + // call lock twice, the internal state won't be cleared by the second + // (failed) lock attempt + lock = newLock; } /** @@ -614,6 +628,45 @@ public void unlock() throws IOException { lock.channel().close(); lock = null; } + + @Override + public String toString() { + return "Storage Directory " + this.root; + } + + /** + * Check whether underlying file system supports file locking. + * + * @return true if exclusive locks are supported or + * false otherwise. + * @throws IOException + * @see StorageDirectory#lock() + */ + public boolean isLockSupported() throws IOException { + FileLock firstLock = null; + FileLock secondLock = null; + try { + firstLock = lock; + if(firstLock == null) { + firstLock = tryLock(); + if(firstLock == null) + return true; + } + secondLock = tryLock(); + if(secondLock == null) + return true; + } finally { + if(firstLock != null && firstLock != lock) { + firstLock.release(); + firstLock.channel().close(); + } + if(secondLock != null) { + secondLock.release(); + secondLock.channel().close(); + } + } + return false; + } } /** @@ -829,41 +882,6 @@ public void unlockAll() throws IOException { } } - /** - * Check whether underlying file system supports file locking. - * - * @return true if exclusive locks are supported or - * false otherwise. - * @throws IOException - * @see StorageDirectory#lock() - */ - public boolean isLockSupported(int idx) throws IOException { - StorageDirectory sd = storageDirs.get(idx); - FileLock firstLock = null; - FileLock secondLock = null; - try { - firstLock = sd.lock; - if(firstLock == null) { - firstLock = sd.tryLock(); - if(firstLock == null) - return true; - } - secondLock = sd.tryLock(); - if(secondLock == null) - return true; - } finally { - if(firstLock != null && firstLock != sd.lock) { - firstLock.release(); - firstLock.channel().close(); - } - if(secondLock != null) { - secondLock.release(); - secondLock.channel().close(); - } - } - return false; - } - public static String getBuildVersion() { return VersionInfo.getRevision(); } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java index 5f152999ec8..95573f38f09 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java @@ -25,6 +25,8 @@ import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableUtils; +import com.google.common.base.Joiner; + /** * Common class for storage information. 
* @@ -105,4 +107,9 @@ public String toString() { .append(";nsid=").append(namespaceID).append(";c=").append(cTime); return sb.toString(); } + + public String toColonSeparatedString() { + return Joiner.on(":").join( + layoutVersion, namespaceID, cTime, clusterID); + } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java index e4ee76b7716..eca31fe4d04 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java @@ -19,29 +19,21 @@ import java.io.BufferedInputStream; import java.io.DataInputStream; -import java.io.File; import java.io.IOException; -import java.net.URI; -import java.util.Collection; import java.util.Iterator; -import java.util.zip.CheckedInputStream; import java.util.zip.Checksum; import org.apache.hadoop.classification.InterfaceAudience; -import org.apache.hadoop.hdfs.protocol.LayoutVersion; -import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.server.common.HdfsConstants; +import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.common.Storage.StorageState; -import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; -import static org.apache.hadoop.hdfs.server.common.Util.now; -import org.apache.hadoop.hdfs.server.namenode.FSImage; -import org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; -import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; -import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; -import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.LogLoadPlan; +import org.apache.hadoop.io.IOUtils; +import org.apache.hadoop.util.StringUtils; + +import com.google.common.base.Preconditions; /** * Extension of FSImage for the backup node. @@ -50,29 +42,56 @@ */ @InterfaceAudience.Private public class BackupImage extends FSImage { - // Names of the journal spool directory and the spool file - private static final String STORAGE_JSPOOL_DIR = "jspool"; - private static final String STORAGE_JSPOOL_FILE = - NNStorage.NameNodeFile.EDITS_NEW.getName(); - /** Backup input stream for loading edits into memory */ - private EditLogBackupInputStream backupInputStream; - - /** Is journal spooling in progress */ - volatile JSpoolState jsState; - - static enum JSpoolState { - OFF, - INPROGRESS, - WAIT; + private EditLogBackupInputStream backupInputStream = + new EditLogBackupInputStream("Data from remote NameNode"); + + /** + * Current state of the BackupNode. The BackupNode's state + * transitions are as follows: + * + * Initial: DROP_UNTIL_NEXT_ROLL + * - Transitions to JOURNAL_ONLY the next time the log rolls + * - Transitions to IN_SYNC in convergeJournalSpool + * - Transitions back to JOURNAL_ONLY if the log rolls while + * stopApplyingOnNextRoll is true. + */ + volatile BNState bnState; + static enum BNState { + /** + * Edits from the NN should be dropped. 
On the next log roll, + * transition to JOURNAL_ONLY state + */ + DROP_UNTIL_NEXT_ROLL, + /** + * Edits from the NN should be written to the local edits log + * but not applied to the namespace. + */ + JOURNAL_ONLY, + /** + * Edits should be written to the local edits log and applied + * to the local namespace. + */ + IN_SYNC; } /** + * Flag to indicate that the next time the NN rolls, the BN + * should transition from to JOURNAL_ONLY state. + * {@see #freezeNamespaceAtNextRoll()} */ - BackupImage() { - super(); + private boolean stopApplyingEditsOnNextRoll = false; + + /** + * Construct a backup image. + * @param conf Configuration + * @throws IOException if storage cannot be initialised. + */ + BackupImage(Configuration conf) throws IOException { + super(conf); storage.setDisablePreUpgradableLayoutCheck(true); - jsState = JSpoolState.OFF; + bnState = BNState.DROP_UNTIL_NEXT_ROLL; + editLog.initJournals(); } /** @@ -81,14 +100,9 @@ static enum JSpoolState { * Read VERSION and fstime files if exist.
    * Do not load image or edits. * - * @param imageDirs list of image directories as URI. - * @param editsDirs list of edits directories URI. * @throws IOException if the node should shutdown. */ - void recoverCreateRead(Collection imageDirs, - Collection editsDirs) throws IOException { - storage.setStorageDirectories(imageDirs, editsDirs); - storage.setCheckpointTime(0L); + void recoverCreateRead() throws IOException { for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); StorageState curState; @@ -122,283 +136,261 @@ void recoverCreateRead(Collection imageDirs, } } - /** - * Reset storage directories. - *

    - * Unlock the storage. - * Rename current to lastcheckpoint.tmp - * and recreate empty current. - * @throws IOException - */ - synchronized void reset() throws IOException { - // reset NameSpace tree - FSDirectory fsDir = getFSNamesystem().dir; - fsDir.reset(); - - // unlock, close and rename storage directories - storage.unlockAll(); - // recover from unsuccessful checkpoint if necessary - recoverCreateRead(storage.getImageDirectories(), - storage.getEditsDirectories()); - // rename and recreate - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - // rename current to lastcheckpoint.tmp - storage.moveCurrent(sd); - } - } - - /** - * Load checkpoint from local files only if the memory state is empty.
    - * Set new checkpoint time received from the name-node.
    - * Move lastcheckpoint.tmp to previous.checkpoint. - * @throws IOException - */ - void loadCheckpoint(CheckpointSignature sig) throws IOException { - // load current image and journal if it is not in memory already - if(!editLog.isOpen()) - editLog.open(); - - FSDirectory fsDir = getFSNamesystem().dir; - if(fsDir.isEmpty()) { - Iterator itImage - = storage.dirIterator(NameNodeDirType.IMAGE); - Iterator itEdits - = storage.dirIterator(NameNodeDirType.EDITS); - if(!itImage.hasNext() || ! itEdits.hasNext()) - throw new IOException("Could not locate checkpoint directories"); - StorageDirectory sdName = itImage.next(); - StorageDirectory sdEdits = itEdits.next(); - getFSDirectoryRootLock().writeLock(); - try { // load image under rootDir lock - loadFSImage(NNStorage.getStorageFile(sdName, NameNodeFile.IMAGE)); - } finally { - getFSDirectoryRootLock().writeUnlock(); - } - loadFSEdits(sdEdits); - } - - // set storage fields - storage.setStorageInfo(sig); - storage.setImageDigest(sig.imageDigest); - storage.setCheckpointTime(sig.checkpointTime); - } - /** * Save meta-data into fsimage files. * and create empty edits. */ void saveCheckpoint() throws IOException { - saveNamespace(false); - } - - private FSDirectory getFSDirectoryRootLock() { - return getFSNamesystem().dir; - } - - static File getJSpoolDir(StorageDirectory sd) { - return new File(sd.getRoot(), STORAGE_JSPOOL_DIR); - } - - static File getJSpoolFile(StorageDirectory sd) { - return new File(getJSpoolDir(sd), STORAGE_JSPOOL_FILE); + saveNamespace(); } /** - * Journal writer journals new meta-data state. - *

      - *
    1. If Journal Spool state is OFF then journal records (edits) - * are applied directly to meta-data state in memory and are written - * to the edits file(s).
    2. - *
    3. If Journal Spool state is INPROGRESS then records are only - * written to edits.new file, which is called Spooling.
    4. - *
    5. Journal Spool state WAIT blocks journaling until the - * Journal Spool reader finalizes merging of the spooled data and - * switches to applying journal to memory.
    6. - *
    - * @param length length of data. + * Receive a batch of edits from the NameNode. + * + * Depending on bnState, different actions are taken. See + * {@link BackupImage.BNState} + * + * @param firstTxId first txid in batch + * @param numTxns number of transactions * @param data serialized journal records. * @throws IOException * @see #convergeJournalSpool() */ - synchronized void journal(int length, byte[] data) throws IOException { - assert backupInputStream.length() == 0 : "backup input stream is not empty"; - try { - switch(jsState) { - case WAIT: - case OFF: - // wait until spooling is off - waitSpoolEnd(); - // update NameSpace in memory - backupInputStream.setBytes(data); - FSEditLogLoader logLoader = new FSEditLogLoader(namesystem); - int logVersion = storage.getLayoutVersion(); - BufferedInputStream bin = new BufferedInputStream(backupInputStream); - DataInputStream in = new DataInputStream(bin); - Checksum checksum = null; - if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) { - checksum = FSEditLog.getChecksum(); - in = new DataInputStream(new CheckedInputStream(bin, checksum)); - } - logLoader.loadEditRecords(logVersion, in, checksum, true); - getFSNamesystem().dir.updateCountForINodeWithQuota(); // inefficient! - break; - case INPROGRESS: - break; - } - // write to files - editLog.logEdit(length, data); - editLog.logSync(); - } finally { - backupInputStream.clear(); + synchronized void journal(long firstTxId, int numTxns, byte[] data) throws IOException { + if (LOG.isTraceEnabled()) { + LOG.trace("Got journal, " + + "state = " + bnState + + "; firstTxId = " + firstTxId + + "; numTxns = " + numTxns); } - } - - private synchronized void waitSpoolEnd() { - while(jsState == JSpoolState.WAIT) { - try { - wait(); - } catch (InterruptedException e) {} - } - // now spooling should be off, verifying just in case - assert jsState == JSpoolState.OFF : "Unexpected JSpool state: " + jsState; - } - - /** - * Start journal spool. - * Switch to writing into edits.new instead of edits. - * - * edits.new for spooling is in separate directory "spool" rather than in - * "current" because the two directories should be independent. - * While spooling a checkpoint can happen and current will first - * move to lastcheckpoint.tmp and then to previous.checkpoint - * spool/edits.new will remain in place during that. 
- */ - synchronized void startJournalSpool(NamenodeRegistration nnReg) - throws IOException { - switch(jsState) { - case OFF: - break; - case INPROGRESS: + + switch(bnState) { + case DROP_UNTIL_NEXT_ROLL: return; - case WAIT: - waitSpoolEnd(); + + case IN_SYNC: + // update NameSpace in memory + applyEdits(firstTxId, numTxns, data); + break; + + case JOURNAL_ONLY: + break; + + default: + throw new AssertionError("Unhandled state: " + bnState); } - - // create journal spool directories - for (Iterator it - = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) { - StorageDirectory sd = it.next(); - File jsDir = getJSpoolDir(sd); - if (!jsDir.exists() && !jsDir.mkdirs()) { - throw new IOException("Mkdirs failed to create " - + jsDir.getCanonicalPath()); - } - // create edit file if missing - File eFile = storage.getEditFile(sd); - if(!eFile.exists()) { - editLog.createEditLogFile(eFile); - } - } - - if(!editLog.isOpen()) - editLog.open(); - - // create streams pointing to the journal spool files - // subsequent journal records will go directly to the spool - editLog.divertFileStreams(STORAGE_JSPOOL_DIR + "/" + STORAGE_JSPOOL_FILE); - setCheckpointState(CheckpointStates.ROLLED_EDITS); - - // set up spooling - if(backupInputStream == null) - backupInputStream = new EditLogBackupInputStream(nnReg.getAddress()); - jsState = JSpoolState.INPROGRESS; + + // write to BN's local edit log. + logEditsLocally(firstTxId, numTxns, data); } - synchronized void setCheckpointTime(int length, byte[] data) - throws IOException { + /** + * Write the batch of edits to the local copy of the edit logs. + */ + private void logEditsLocally(long firstTxId, int numTxns, byte[] data) { + long expectedTxId = editLog.getLastWrittenTxId() + 1; + Preconditions.checkState(firstTxId == expectedTxId, + "received txid batch starting at %s but expected txn %s", + firstTxId, expectedTxId); + editLog.setNextTxId(firstTxId + numTxns - 1); + editLog.logEdit(data.length, data); + editLog.logSync(); + } + + /** + * Apply the batch of edits to the local namespace. + */ + private synchronized void applyEdits(long firstTxId, int numTxns, byte[] data) + throws IOException { + Preconditions.checkArgument(firstTxId == lastAppliedTxId + 1, + "Received txn batch starting at %s but expected %s", + firstTxId, lastAppliedTxId + 1); assert backupInputStream.length() == 0 : "backup input stream is not empty"; try { - // unpack new checkpoint time + if (LOG.isTraceEnabled()) { + LOG.debug("data:" + StringUtils.byteToHexString(data)); + } backupInputStream.setBytes(data); - DataInputStream in = backupInputStream.getDataInputStream(); - byte op = in.readByte(); - assert op == NamenodeProtocol.JA_CHECKPOINT_TIME; - LongWritable lw = new LongWritable(); - lw.readFields(in); - storage.setCheckpointTimeInStorage(lw.get()); + FSEditLogLoader logLoader = new FSEditLogLoader(namesystem); + int logVersion = storage.getLayoutVersion(); + BufferedInputStream bin = new BufferedInputStream(backupInputStream); + DataInputStream in = new DataInputStream(bin); + Checksum checksum = FSEditLog.getChecksum(); + int numLoaded = logLoader.loadEditRecords(logVersion, in, checksum, true, + lastAppliedTxId + 1); + if (numLoaded != numTxns) { + throw new IOException("Batch of txns starting at txnid " + + firstTxId + " was supposed to contain " + numTxns + + " transactions but only was able to apply " + numLoaded); + } + lastAppliedTxId += numTxns; + + getFSNamesystem().dir.updateCountForINodeWithQuota(); // inefficient! 
} finally { backupInputStream.clear(); } } /** - * Merge Journal Spool to memory.

    - * Journal Spool reader reads journal records from edits.new. - * When it reaches the end of the file it sets {@link JSpoolState} to WAIT. - * This blocks journaling (see {@link #journal(int,byte[])}. - * The reader - *

      - *
    • reads remaining journal records if any,
    • - *
    • renames edits.new to edits,
    • - *
    • sets {@link JSpoolState} to OFF,
    • - *
    • and notifies the journaling thread.
    • - *
    - * Journaling resumes with applying new journal records to the memory state, - * and writing them into edits file(s). + * Transition the BackupNode from JOURNAL_ONLY state to IN_SYNC state. + * This is done by repeated invocations of tryConvergeJournalSpool until + * we are caught up to the latest in-progress edits file. */ void convergeJournalSpool() throws IOException { - Iterator itEdits - = storage.dirIterator(NameNodeDirType.EDITS); - if(! itEdits.hasNext()) - throw new IOException("Could not locate checkpoint directories"); - StorageDirectory sdEdits = itEdits.next(); - int numEdits = 0; - File jSpoolFile = getJSpoolFile(sdEdits); - long startTime = now(); - if(jSpoolFile.exists()) { - // load edits.new - EditLogFileInputStream edits = new EditLogFileInputStream(jSpoolFile); - BufferedInputStream bin = new BufferedInputStream(edits); - DataInputStream in = new DataInputStream(bin); - FSEditLogLoader logLoader = new FSEditLogLoader(namesystem); - int logVersion = logLoader.readLogVersion(in); - Checksum checksum = null; - if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) { - checksum = FSEditLog.getChecksum(); - in = new DataInputStream(new CheckedInputStream(bin, checksum)); - } - numEdits += logLoader.loadEditRecords(logVersion, in, checksum, false); + Preconditions.checkState(bnState == BNState.JOURNAL_ONLY, + "bad state: %s", bnState); - // first time reached the end of spool - jsState = JSpoolState.WAIT; - numEdits += logLoader.loadEditRecords(logVersion, - in, checksum, true); - getFSNamesystem().dir.updateCountForINodeWithQuota(); - edits.close(); + while (!tryConvergeJournalSpool()) { + ; } + assert bnState == BNState.IN_SYNC; + } + + private boolean tryConvergeJournalSpool() throws IOException { + Preconditions.checkState(bnState == BNState.JOURNAL_ONLY, + "bad state: %s", bnState); + + // This section is unsynchronized so we can continue to apply + // ahead of where we're reading, concurrently. 
Since the state + // is JOURNAL_ONLY at this point, we know that lastAppliedTxId + // doesn't change, and curSegmentTxId only increases - FSImage.LOG.info("Edits file " + jSpoolFile.getCanonicalPath() - + " of size " + jSpoolFile.length() + " edits # " + numEdits - + " loaded in " + (now()-startTime)/1000 + " seconds."); + while (lastAppliedTxId < editLog.getCurSegmentTxId() - 1) { + long target = editLog.getCurSegmentTxId(); + LOG.info("Loading edits into backupnode to try to catch up from txid " + + lastAppliedTxId + " to " + target); + FSImageTransactionalStorageInspector inspector = + new FSImageTransactionalStorageInspector(); + + storage.inspectStorageDirs(inspector); + LogLoadPlan logLoadPlan = inspector.createLogLoadPlan(lastAppliedTxId, + target - 1); + + logLoadPlan.doRecovery(); + loadEdits(logLoadPlan.getEditsFiles()); + } + + // now, need to load the in-progress file + synchronized (this) { + if (lastAppliedTxId != editLog.getCurSegmentTxId() - 1) { + LOG.debug("Logs rolled while catching up to current segment"); + return false; // drop lock and try again to load local logs + } + + EditLogInputStream stream = getEditLog().getInProgressFileInputStream(); + try { + long remainingTxns = getEditLog().getLastWrittenTxId() - lastAppliedTxId; + + LOG.info("Going to finish converging with remaining " + remainingTxns + + " txns from in-progress stream " + stream); + + FSEditLogLoader loader = new FSEditLogLoader(namesystem); + int numLoaded = loader.loadFSEdits(stream, lastAppliedTxId + 1); + lastAppliedTxId += numLoaded; + assert numLoaded == remainingTxns : + "expected to load " + remainingTxns + " but loaded " + + numLoaded + " from " + stream; + } finally { + IOUtils.closeStream(stream); + } - // rename spool edits.new to edits making it in sync with the active node - // subsequent journal records will go directly to edits - editLog.revertFileStreams(STORAGE_JSPOOL_DIR + "/" + STORAGE_JSPOOL_FILE); + LOG.info("Successfully synced BackupNode with NameNode at txnid " + + lastAppliedTxId); + setState(BNState.IN_SYNC); + } + return true; + } - // write version file - resetVersion(false, storage.getImageDigest()); + /** + * Transition edit log to a new state, logging as necessary. + */ + private synchronized void setState(BNState newState) { + if (LOG.isDebugEnabled()) { + LOG.debug("State transition " + bnState + " -> " + newState, + new Exception("trace")); + } + bnState = newState; + } - // wake up journal writer - synchronized(this) { - jsState = JSpoolState.OFF; + /** + * Receive a notification that the NameNode has begun a new edit log. + * This causes the BN to also start the new edit log in its local + * directories. + */ + synchronized void namenodeStartedLogSegment(long txid) + throws IOException { + LOG.info("NameNode started a new log segment at txid " + txid); + if (editLog.isOpen()) { + if (editLog.getLastWrittenTxId() == txid - 1) { + // We are in sync with the NN, so end and finalize the current segment + editLog.endCurrentLogSegment(false); + } else { + // We appear to have missed some transactions -- the NN probably + // lost contact with us temporarily. So, mark the current segment + // as aborted. + LOG.warn("NN started new log segment at txid " + txid + + ", but BN had only written up to txid " + + editLog.getLastWrittenTxId() + + "in the log segment starting at " + + editLog.getCurSegmentTxId() + ". 
Aborting this " + + "log segment."); + editLog.abortCurrentLogSegment(); + } + } + editLog.setNextTxId(txid); + editLog.startLogSegment(txid, false); + if (bnState == BNState.DROP_UNTIL_NEXT_ROLL) { + setState(BNState.JOURNAL_ONLY); + } + + if (stopApplyingEditsOnNextRoll) { + if (bnState == BNState.IN_SYNC) { + LOG.info("Stopped applying edits to prepare for checkpoint."); + setState(BNState.JOURNAL_ONLY); + } + stopApplyingEditsOnNextRoll = false; notifyAll(); } + } - // Rename lastcheckpoint.tmp to previous.checkpoint - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - storage.moveLastCheckpoint(sd); + /** + * Request that the next time the BN receives a log roll, it should + * stop applying the edits log to the local namespace. This is + * typically followed on by a call to {@link #waitUntilNamespaceFrozen()} + */ + synchronized void freezeNamespaceAtNextRoll() { + stopApplyingEditsOnNextRoll = true; + } + + /** + * After {@link #freezeNamespaceAtNextRoll()} has been called, wait until + * the BN receives notification of the next log roll. + */ + synchronized void waitUntilNamespaceFrozen() throws IOException { + if (bnState != BNState.IN_SYNC) return; + + LOG.info("Waiting until the NameNode rolls its edit logs in order " + + "to freeze the BackupNode namespace."); + while (bnState == BNState.IN_SYNC) { + Preconditions.checkState(stopApplyingEditsOnNextRoll, + "If still in sync, we should still have the flag set to " + + "freeze at next roll"); + try { + wait(); + } catch (InterruptedException ie) { + LOG.warn("Interrupted waiting for namespace to freeze", ie); + throw new IOException(ie); + } } + LOG.info("BackupNode namespace frozen."); + } + + /** + * Override close() so that we don't finalize edit logs. + */ + @Override + public synchronized void close() throws IOException { + editLog.abortCurrentLogSegment(); + storage.close(); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java new file mode 100644 index 00000000000..35c4b7384f7 --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.IOException; + +import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger; +import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; + +/** + * A JournalManager implementation that uses RPCs to log transactions + * to a BackupNode. 
+ */ +class BackupJournalManager implements JournalManager { + + private final NamenodeRegistration nnReg; + private final NamenodeRegistration bnReg; + + BackupJournalManager(NamenodeRegistration bnReg, + NamenodeRegistration nnReg) { + this.bnReg = bnReg; + this.nnReg = nnReg; + } + + @Override + public EditLogOutputStream startLogSegment(long txId) throws IOException { + EditLogBackupOutputStream stm = new EditLogBackupOutputStream(bnReg, nnReg); + stm.startLogSegment(txId); + return stm; + } + + @Override + public void finalizeLogSegment(long firstTxId, long lastTxId) + throws IOException { + } + + @Override + public void setOutputBufferCapacity(int size) { + } + + @Override + public void purgeLogsOlderThan(long minTxIdToKeep, StoragePurger purger) + throws IOException { + } + + public boolean matchesRegistration(NamenodeRegistration bnReg) { + return bnReg.getAddress().equals(this.bnReg.getAddress()); + } + + @Override + public EditLogInputStream getInProgressInputStream(long segmentStartsAtTxId) { + return null; + } +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java index 2e7f48f2843..ad575b9e27a 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java @@ -22,21 +22,20 @@ import java.net.SocketTimeoutException; import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.CommonConfigurationKeys; +import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.FSConstants; +import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; +import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; -import org.apache.hadoop.hdfs.server.common.Storage; -import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; -import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature; -import org.apache.hadoop.hdfs.server.namenode.FSImage.CheckpointStates; -import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.ipc.RPC; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.net.NetUtils; /** @@ -53,7 +52,7 @@ * */ @InterfaceAudience.Private -public class BackupNode extends NameNode { +public class BackupNode extends NameNode implements JournalProtocol { private static final String BN_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY; private static final String BN_ADDRESS_DEFAULT = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT; private static final String BN_HTTP_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY; @@ -119,10 +118,9 @@ protected void setHttpServerAddress(Configuration conf){ @Override // NameNode protected void loadNamesystem(Configuration conf) throws IOException { - BackupImage bnImage = new BackupImage(); + BackupImage bnImage = new BackupImage(conf); this.namesystem = new FSNamesystem(conf, bnImage); - bnImage.recoverCreateRead(FSNamesystem.getNamespaceDirs(conf), - 
FSNamesystem.getNamespaceEditsDirs(conf)); + bnImage.recoverCreateRead(); } @Override // NameNode @@ -179,6 +177,17 @@ public void stop() { super.stop(); } + + @Override + public long getProtocolVersion(String protocol, long clientVersion) + throws IOException { + if (protocol.equals(JournalProtocol.class.getName())) { + return JournalProtocol.versionID; + } else { + return super.getProtocolVersion(protocol, clientVersion); + } + } + ///////////////////////////////////////////////////// // NamenodeProtocol implementation for backup node. ///////////////////////////////////////////////////// @@ -205,34 +214,36 @@ public NamenodeCommand startCheckpoint(NamenodeRegistration registration) public void endCheckpoint(NamenodeRegistration registration, CheckpointSignature sig) throws IOException { throw new UnsupportedActionException("endCheckpoint"); - } + } - @Override // NamenodeProtocol + ///////////////////////////////////////////////////// + // BackupNodeProtocol implementation for backup node. + ///////////////////////////////////////////////////// + + @Override public void journal(NamenodeRegistration nnReg, - int jAction, - int length, - byte[] args) throws IOException { + long firstTxId, int numTxns, + byte[] records) throws IOException { verifyRequest(nnReg); if(!nnRpcAddress.equals(nnReg.getAddress())) throw new IOException("Journal request from unexpected name-node: " + nnReg.getAddress() + " expecting " + nnRpcAddress); - BackupImage bnImage = (BackupImage)getFSImage(); - switch(jAction) { - case (int)JA_IS_ALIVE: - return; - case (int)JA_JOURNAL: - bnImage.journal(length, args); - return; - case (int)JA_JSPOOL_START: - bnImage.startJournalSpool(nnReg); - return; - case (int)JA_CHECKPOINT_TIME: - bnImage.setCheckpointTime(length, args); - setRegistration(); // keep registration up to date - return; - default: - throw new IOException("Unexpected journal action: " + jAction); - } + getBNImage().journal(firstTxId, numTxns, records); + } + + @Override + public void startLogSegment(NamenodeRegistration registration, long txid) + throws IOException { + verifyRequest(registration); + + getBNImage().namenodeStartedLogSegment(txid); + } + + ////////////////////////////////////////////////////// + + + BackupImage getBNImage() { + return (BackupImage)getFSImage(); } boolean shouldCheckpointAtStartup() { @@ -241,9 +252,9 @@ boolean shouldCheckpointAtStartup() { assert fsImage.getStorage().getNumStorageDirs() > 0; return ! fsImage.getStorage().getStorageDir(0).getVersionFile().exists(); } - if(namesystem == null || namesystem.dir == null || getFSImage() == null) - return true; - return fsImage.getEditLog().getNumEditStreams() == 0; + + // BN always checkpoints on startup in order to get in sync with namespace + return true; } private NamespaceInfo handshake(Configuration conf) throws IOException { @@ -287,14 +298,6 @@ void doCheckpoint() throws IOException { checkpointManager.doCheckpoint(); } - CheckpointStates getCheckpointState() { - return getFSImage().getCheckpointState(); - } - - void setCheckpointState(CheckpointStates cs) { - getFSImage().setCheckpointState(cs); - } - /** * Register this backup node with the active name-node. 
* @param nsInfo @@ -302,14 +305,15 @@ void setCheckpointState(CheckpointStates cs) { */ private void registerWith(NamespaceInfo nsInfo) throws IOException { BackupImage bnImage = (BackupImage)getFSImage(); + NNStorage storage = bnImage.getStorage(); // verify namespaceID - if(bnImage.getStorage().getNamespaceID() == 0) // new backup storage - bnImage.getStorage().setStorageInfo(nsInfo); - else if(bnImage.getStorage().getNamespaceID() != nsInfo.getNamespaceID()) - throw new IOException("Incompatible namespaceIDs" - + ": active node namespaceID = " + nsInfo.getNamespaceID() - + "; backup node namespaceID = " + bnImage.getStorage().getNamespaceID()); - + if (storage.getNamespaceID() == 0) { // new backup storage + storage.setStorageInfo(nsInfo); + storage.setBlockPoolID(nsInfo.getBlockPoolID()); + storage.setClusterID(nsInfo.getClusterID()); + } else { + nsInfo.validateStorage(storage); + } setRegistration(); NamenodeRegistration nnReg = null; while(!isStopRequested()) { @@ -338,23 +342,6 @@ else if(!nnReg.isRole(NamenodeRole.NAMENODE)) { nnRpcAddress = nnReg.getAddress(); } - /** - * Reset node namespace state in memory and in storage directories. - * @throws IOException - */ - void resetNamespace() throws IOException { - ((BackupImage)getFSImage()).reset(); - } - - /** - * Get size of the local journal (edit log). - * @return size of the current journal - * @throws IOException - */ - long journalSize() throws IOException { - return namesystem.getEditLogSize(); - } - // TODO: move to a common with DataNode util class private static NamespaceInfo handshake(NamenodeProtocol namenode) throws IOException, SocketTimeoutException { diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java index 7a9d970f92a..0db5cb11138 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java @@ -24,10 +24,11 @@ import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.hdfs.server.common.StorageInfo; import org.apache.hadoop.hdfs.server.namenode.FSImage; -import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableUtils; +import com.google.common.collect.ComparisonChain; + /** * A unique signature intended to identify checkpoint transactions. 
*/ @@ -35,41 +36,35 @@ public class CheckpointSignature extends StorageInfo implements WritableComparable { private static final String FIELD_SEPARATOR = ":"; - long editsTime = -1L; - long checkpointTime = -1L; - MD5Hash imageDigest = null; + private static final int NUM_FIELDS = 7; + String blockpoolID = ""; + + long mostRecentCheckpointTxId; + long curSegmentTxId; public CheckpointSignature() {} CheckpointSignature(FSImage fsImage) { super(fsImage.getStorage()); blockpoolID = fsImage.getBlockPoolID(); - editsTime = fsImage.getEditLog().getFsEditTime(); - checkpointTime = fsImage.getStorage().getCheckpointTime(); - imageDigest = fsImage.getStorage().getImageDigest(); - checkpointTime = fsImage.getStorage().getCheckpointTime(); + + mostRecentCheckpointTxId = fsImage.getStorage().getMostRecentCheckpointTxId(); + curSegmentTxId = fsImage.getEditLog().getCurSegmentTxId(); } CheckpointSignature(String str) { String[] fields = str.split(FIELD_SEPARATOR); - assert fields.length == 8 : "Must be 8 fields in CheckpointSignature"; - layoutVersion = Integer.valueOf(fields[0]); - namespaceID = Integer.valueOf(fields[1]); - cTime = Long.valueOf(fields[2]); - editsTime = Long.valueOf(fields[3]); - checkpointTime = Long.valueOf(fields[4]); - imageDigest = new MD5Hash(fields[5]); - clusterID = fields[6]; - blockpoolID = fields[7]; - } - - /** - * Get the MD5 image digest - * @return the MD5 image digest - */ - MD5Hash getImageDigest() { - return imageDigest; + assert fields.length == NUM_FIELDS : + "Must be " + NUM_FIELDS + " fields in CheckpointSignature"; + int i = 0; + layoutVersion = Integer.valueOf(fields[i++]); + namespaceID = Integer.valueOf(fields[i++]); + cTime = Long.valueOf(fields[i++]); + mostRecentCheckpointTxId = Long.valueOf(fields[i++]); + curSegmentTxId = Long.valueOf(fields[i++]); + clusterID = fields[i++]; + blockpoolID = fields[i++]; } /** @@ -101,33 +96,26 @@ public String toString() { return String.valueOf(layoutVersion) + FIELD_SEPARATOR + String.valueOf(namespaceID) + FIELD_SEPARATOR + String.valueOf(cTime) + FIELD_SEPARATOR - + String.valueOf(editsTime) + FIELD_SEPARATOR - + String.valueOf(checkpointTime) + FIELD_SEPARATOR - + imageDigest.toString() + FIELD_SEPARATOR + + String.valueOf(mostRecentCheckpointTxId) + FIELD_SEPARATOR + + String.valueOf(curSegmentTxId) + FIELD_SEPARATOR + clusterID + FIELD_SEPARATOR + blockpoolID ; } void validateStorageInfo(FSImage si) throws IOException { - if(layoutVersion != si.getLayoutVersion() - || namespaceID != si.getNamespaceID() - || cTime != si.getStorage().cTime - || checkpointTime != si.getStorage().getCheckpointTime() - || !imageDigest.equals(si.getStorage().imageDigest) - || !clusterID.equals(si.getClusterID()) - || !blockpoolID.equals(si.getBlockPoolID())) { - // checkpointTime can change when the image is saved - do not compare + if(layoutVersion != si.getStorage().layoutVersion + || namespaceID != si.getStorage().namespaceID + || cTime != si.getStorage().cTime + || !clusterID.equals(si.getClusterID()) + || !blockpoolID.equals(si.getBlockPoolID())) { throw new IOException("Inconsistent checkpoint fields.\n" + "LV = " + layoutVersion + " namespaceID = " + namespaceID - + " cTime = " + cTime + "; checkpointTime = " + checkpointTime - + " ; imageDigest = " + imageDigest + + " cTime = " + cTime + " ; clusterId = " + clusterID + " ; blockpoolId = " + blockpoolID + ".\nExpecting respectively: " - + si.getLayoutVersion() + "; " - + si.getNamespaceID() + "; " + si.getStorage().cTime - + "; " + si.getStorage().getCheckpointTime() + "; " - + 
si.getStorage().imageDigest + + si.getStorage().layoutVersion + "; " + + si.getStorage().namespaceID + "; " + si.getStorage().cTime + "; " + si.getClusterID() + "; " + si.getBlockPoolID() + "."); } @@ -137,19 +125,15 @@ void validateStorageInfo(FSImage si) throws IOException { // Comparable interface // public int compareTo(CheckpointSignature o) { - return - (layoutVersion < o.layoutVersion) ? -1 : - (layoutVersion > o.layoutVersion) ? 1 : - (namespaceID < o.namespaceID) ? -1 : (namespaceID > o.namespaceID) ? 1 : - (cTime < o.cTime) ? -1 : (cTime > o.cTime) ? 1 : - (editsTime < o.editsTime) ? -1 : (editsTime > o.editsTime) ? 1 : - (checkpointTime < o.checkpointTime) ? -1 : - (checkpointTime > o.checkpointTime) ? 1 : - (clusterID.compareTo(o.clusterID) < 0) ? -1 : - (clusterID.compareTo(o.clusterID) > 0) ? 1 : - (blockpoolID.compareTo(o.blockpoolID) < 0) ? -1 : - (blockpoolID.compareTo(o.blockpoolID) > 0) ? 1 : - imageDigest.compareTo(o.imageDigest); + return ComparisonChain.start() + .compare(layoutVersion, o.layoutVersion) + .compare(namespaceID, o.namespaceID) + .compare(cTime, o.cTime) + .compare(mostRecentCheckpointTxId, o.mostRecentCheckpointTxId) + .compare(curSegmentTxId, o.curSegmentTxId) + .compare(clusterID, o.clusterID) + .compare(blockpoolID, o.blockpoolID) + .result(); } public boolean equals(Object o) { @@ -161,9 +145,8 @@ public boolean equals(Object o) { public int hashCode() { return layoutVersion ^ namespaceID ^ - (int)(cTime ^ editsTime ^ checkpointTime) ^ - imageDigest.hashCode() ^ clusterID.hashCode() - ^ blockpoolID.hashCode(); + (int)(cTime ^ mostRecentCheckpointTxId ^ curSegmentTxId) + ^ clusterID.hashCode() ^ blockpoolID.hashCode(); } ///////////////////////////////////////////////// @@ -172,17 +155,14 @@ public int hashCode() { public void write(DataOutput out) throws IOException { super.write(out); WritableUtils.writeString(out, blockpoolID); - out.writeLong(editsTime); - out.writeLong(checkpointTime); - imageDigest.write(out); + out.writeLong(mostRecentCheckpointTxId); + out.writeLong(curSegmentTxId); } public void readFields(DataInput in) throws IOException { super.readFields(in); blockpoolID = WritableUtils.readString(in); - editsTime = in.readLong(); - checkpointTime = in.readLong(); - imageDigest = new MD5Hash(); - imageDigest.readFields(in); + mostRecentCheckpointTxId = in.readLong(); + curSegmentTxId = in.readLong(); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java index ee5e87c9aa8..49f713015c4 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java @@ -17,29 +17,30 @@ */ package org.apache.hadoop.hdfs.server.namenode; -import java.io.IOException; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_DEFAULT; +import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY; +import static org.apache.hadoop.hdfs.server.common.Util.now; + import java.io.File; +import java.io.IOException; import java.net.InetSocketAddress; -import java.util.Collection; +import java.util.List; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; - -import org.apache.hadoop.hdfs.protocol.FSConstants; -import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; -import static org.apache.hadoop.hdfs.server.common.Util.now; -import 
org.apache.hadoop.hdfs.server.namenode.FSImage.CheckpointStates; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; -import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand; -import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; -import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; -import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; +import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand; +import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; +import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.http.HttpServer; +import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.util.Daemon; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY; -import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_DEFAULT; + +import com.google.common.collect.Lists; /** * The Checkpointer is responsible for supporting periodic checkpoints @@ -59,7 +60,7 @@ class Checkpointer extends Daemon { private BackupNode backupNode; volatile boolean shouldRun; private long checkpointPeriod; // in seconds - private long checkpointSize; // size (in MB) of current Edit Log + private long checkpointTxnCount; // number of uncheckpointed transactions that triggers a checkpoint private String infoBindAddress; @@ -79,6 +80,7 @@ private NamenodeProtocol getNamenode(){ try { initialize(conf); } catch(IOException e) { + LOG.warn("Checkpointer got exception", e); shutdown(); throw e; } @@ -87,6 +89,7 @@ private NamenodeProtocol getNamenode(){ /** * Initialize checkpoint. */ + @SuppressWarnings("deprecation") private void initialize(Configuration conf) throws IOException { // Create connection to the namenode. 
shouldRun = true; @@ -94,8 +97,9 @@ private void initialize(Configuration conf) throws IOException { // Initialize other scheduling parameters from the configuration checkpointPeriod = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); - checkpointSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_SIZE_KEY, - DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_SIZE_DEFAULT); + checkpointTxnCount = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); + SecondaryNameNode.warnForDeprecatedConfigs(conf); // Pull out exact http address for posting url to avoid ip aliasing issues String fullInfoAddr = conf.get(DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY, @@ -104,8 +108,7 @@ private void initialize(Configuration conf) throws IOException { LOG.info("Checkpoint Period : " + checkpointPeriod + " secs " + "(" + checkpointPeriod/60 + " min)"); - LOG.info("Log Size Trigger : " + checkpointSize + " bytes " + - "(" + checkpointSize/1024 + " KB)"); + LOG.info("Log Size Trigger : " + checkpointTxnCount + " txns "); } /** @@ -128,8 +131,9 @@ public void run() { periodMSec *= 1000; long lastCheckpointTime = 0; - if(!backupNode.shouldCheckpointAtStartup()) + if (!backupNode.shouldCheckpointAtStartup()) { lastCheckpointTime = now(); + } while(shouldRun) { try { long now = now(); @@ -137,8 +141,8 @@ public void run() { if(now >= lastCheckpointTime + periodMSec) { shouldCheckpoint = true; } else { - long size = getJournalSize(); - if(size >= checkpointSize) + long txns = countUncheckpointedTxns(); + if(txns >= checkpointTxnCount) shouldCheckpoint = true; } if(shouldCheckpoint) { @@ -160,64 +164,24 @@ public void run() { } } - private long getJournalSize() throws IOException { - // If BACKUP node has been loaded - // get edits size from the local file. ACTIVE has the same. - if(backupNode.isRole(NamenodeRole.BACKUP) - && getFSImage().getEditLog().isOpen()) - return backupNode.journalSize(); - // Go to the ACTIVE node for its size - return getNamenode().journalSize(backupNode.getRegistration()); - } - - /** - * Download fsimage and edits - * files from the remote name-node. - */ - private void downloadCheckpoint(CheckpointSignature sig) throws IOException { - // Retrieve image file - String fileid = "getimage=1"; - Collection list = getFSImage() - .getStorage().getFiles(NameNodeFile.IMAGE, NameNodeDirType.IMAGE); - File[] files = list.toArray(new File[list.size()]); - assert files.length > 0 : "No checkpoint targets."; - String nnHttpAddr = backupNode.nnHttpAddress; - TransferFsImage.getFileClient(nnHttpAddr, fileid, files, false); - LOG.info("Downloaded file " + files[0].getName() + " size " + - files[0].length() + " bytes."); - - // Retrieve edits file - fileid = "getedit=1"; - list = getFSImage() - .getStorage().getFiles(NameNodeFile.EDITS, NameNodeDirType.EDITS); - files = list.toArray(new File[list.size()]); - assert files.length > 0 : "No checkpoint targets."; - TransferFsImage.getFileClient(nnHttpAddr, fileid, files, false); - LOG.info("Downloaded file " + files[0].getName() + " size " + - files[0].length() + " bytes."); - } - - /** - * Copy the new image into remote name-node. 
- */ - private void uploadCheckpoint(CheckpointSignature sig) throws IOException { - // Use the exact http addr as specified in config to deal with ip aliasing - InetSocketAddress httpSocAddr = backupNode.getHttpAddress(); - int httpPort = httpSocAddr.getPort(); - String fileid = "putimage=1&port=" + httpPort + - "&machine=" + infoBindAddress + - "&token=" + sig.toString() + - "&newChecksum=" + getFSImage().getStorage().getImageDigest().toString(); - LOG.info("Posted URL " + backupNode.nnHttpAddress + fileid); - TransferFsImage.getFileClient(backupNode.nnHttpAddress, - fileid, (File[])null, false); + private long countUncheckpointedTxns() throws IOException { + long curTxId = getNamenode().getTransactionID(); + long uncheckpointedTxns = curTxId - + getFSImage().getStorage().getMostRecentCheckpointTxId(); + assert uncheckpointedTxns >= 0; + return uncheckpointedTxns; } /** * Create a new checkpoint */ void doCheckpoint() throws IOException { + BackupImage bnImage = getFSImage(); + NNStorage bnStorage = bnImage.getStorage(); + long startTime = now(); + bnImage.freezeNamespaceAtNextRoll(); + NamenodeCommand cmd = getNamenode().startCheckpoint(backupNode.getRegistration()); CheckpointCommand cpCmd = null; @@ -233,37 +197,94 @@ void doCheckpoint() throws IOException { throw new IOException("Unsupported NamenodeCommand: "+cmd.getAction()); } + bnImage.waitUntilNamespaceFrozen(); + CheckpointSignature sig = cpCmd.getSignature(); - assert FSConstants.LAYOUT_VERSION == sig.getLayoutVersion() : - "Signature should have current layout version. Expected: " - + FSConstants.LAYOUT_VERSION + " actual "+ sig.getLayoutVersion(); - assert !backupNode.isRole(NamenodeRole.CHECKPOINT) || - cpCmd.isImageObsolete() : "checkpoint node should always download image."; - backupNode.setCheckpointState(CheckpointStates.UPLOAD_START); - if(cpCmd.isImageObsolete()) { - // First reset storage on disk and memory state - backupNode.resetNamespace(); - downloadCheckpoint(sig); + + // Make sure we're talking to the same NN! + sig.validateStorageInfo(bnImage); + + long lastApplied = bnImage.getLastAppliedTxId(); + LOG.debug("Doing checkpoint. Last applied: " + lastApplied); + RemoteEditLogManifest manifest = + getNamenode().getEditLogManifest(bnImage.getLastAppliedTxId()); + + if (!manifest.getLogs().isEmpty()) { + RemoteEditLog firstRemoteLog = manifest.getLogs().get(0); + // we don't have enough logs to roll forward using only logs. Need + // to download and load the image. + if (firstRemoteLog.getStartTxId() > lastApplied + 1) { + LOG.info("Unable to roll forward using only logs. 
Downloading " + + "image with txid " + sig.mostRecentCheckpointTxId); + MD5Hash downloadedHash = TransferFsImage.downloadImageToStorage( + backupNode.nnHttpAddress, sig.mostRecentCheckpointTxId, + bnStorage, true); + bnImage.saveDigestAndRenameCheckpointImage( + sig.mostRecentCheckpointTxId, downloadedHash); + + LOG.info("Loading image with txid " + sig.mostRecentCheckpointTxId); + File file = bnStorage.findImageFile(sig.mostRecentCheckpointTxId); + bnImage.reloadFromImageFile(file); + } + + lastApplied = bnImage.getLastAppliedTxId(); + if (firstRemoteLog.getStartTxId() > lastApplied + 1) { + throw new IOException("No logs to roll forward from " + lastApplied); + } + + // get edits files + for (RemoteEditLog log : manifest.getLogs()) { + TransferFsImage.downloadEditsToStorage( + backupNode.nnHttpAddress, log, bnStorage); + } + + rollForwardByApplyingLogs(manifest, bnImage); } - BackupImage bnImage = getFSImage(); - bnImage.getStorage().setBlockPoolID(backupNode.getBlockPoolId()); - bnImage.getStorage().setClusterID(backupNode.getClusterId()); - bnImage.loadCheckpoint(sig); - sig.validateStorageInfo(bnImage); - bnImage.saveCheckpoint(); + long txid = bnImage.getLastAppliedTxId(); + bnImage.saveFSImageInAllDirs(txid); + bnStorage.writeAll(); - if(cpCmd.needToReturnImage()) - uploadCheckpoint(sig); + if(cpCmd.needToReturnImage()) { + TransferFsImage.uploadImageFromStorage( + backupNode.nnHttpAddress, getImageListenAddress(), + bnStorage, txid); + } getNamenode().endCheckpoint(backupNode.getRegistration(), sig); - bnImage.convergeJournalSpool(); + if (backupNode.getRole() == NamenodeRole.BACKUP) { + bnImage.convergeJournalSpool(); + } backupNode.setRegistration(); // keep registration up to date - if(backupNode.isRole(NamenodeRole.CHECKPOINT)) - getFSImage().getEditLog().close(); + + long imageSize = bnImage.getStorage().getFsImageName(txid).length(); LOG.info("Checkpoint completed in " + (now() - startTime)/1000 + " seconds." - + " New Image Size: " + bnImage.getStorage().getFsImageName().length()); + + " New Image Size: " + imageSize); + } + + private InetSocketAddress getImageListenAddress() { + InetSocketAddress httpSocAddr = backupNode.getHttpAddress(); + int httpPort = httpSocAddr.getPort(); + return new InetSocketAddress(infoBindAddress, httpPort); + } + + static void rollForwardByApplyingLogs( + RemoteEditLogManifest manifest, + FSImage dstImage) throws IOException { + NNStorage dstStorage = dstImage.getStorage(); + + List editsFiles = Lists.newArrayList(); + for (RemoteEditLog log : manifest.getLogs()) { + File f = dstStorage.findFinalizedEditsFile( + log.getStartTxId(), log.getEndTxId()); + if (log.getStartTxId() > dstImage.getLastAppliedTxId()) { + editsFiles.add(f); + } + } + LOG.info("Checkpointer about to load edits from " + + editsFiles.size() + " file(s)."); + dstImage.loadEdits(editsFiles); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java index 81992586e2d..007578f28a6 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java @@ -42,10 +42,6 @@ private static class ByteBufferInputStream extends ByteArrayInputStream { super(new byte[0]); } - byte[] getData() { - return super.buf; - } - void setData(byte[] newBytes) { super.buf = newBytes; super.count = newBytes == null ? 
0 : newBytes.length; diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java index 5e9d5398ed6..5dd82393fd0 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java @@ -19,10 +19,11 @@ import java.io.IOException; import java.net.InetSocketAddress; +import java.util.Arrays; import org.apache.hadoop.hdfs.HdfsConfiguration; import org.apache.hadoop.hdfs.server.common.Storage; -import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.JournalProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.ipc.RPC; @@ -39,7 +40,7 @@ class EditLogBackupOutputStream extends EditLogOutputStream { static int DEFAULT_BUFFER_SIZE = 256; - private NamenodeProtocol backupNode; // RPC proxy to backup node + private JournalProtocol backupNode; // RPC proxy to backup node private NamenodeRegistration bnRegistration; // backup node registration private NamenodeRegistration nnRegistration; // active node registration private EditsDoubleBuffer doubleBuf; @@ -56,8 +57,8 @@ class EditLogBackupOutputStream extends EditLogOutputStream { Storage.LOG.info("EditLogBackupOutputStream connects to: " + bnAddress); try { this.backupNode = - (NamenodeProtocol) RPC.getProxy(NamenodeProtocol.class, - NamenodeProtocol.versionID, bnAddress, new HdfsConfiguration()); + RPC.getProxy(JournalProtocol.class, + JournalProtocol.versionID, bnAddress, new HdfsConfiguration()); } catch(IOException e) { Storage.LOG.error("Error connecting to: " + bnAddress, e); throw e; @@ -65,7 +66,7 @@ class EditLogBackupOutputStream extends EditLogOutputStream { this.doubleBuf = new EditsDoubleBuffer(DEFAULT_BUFFER_SIZE); this.out = new DataOutputBuffer(DEFAULT_BUFFER_SIZE); } - + @Override // JournalStream public String getName() { return bnRegistration.getAddress(); @@ -109,6 +110,12 @@ public void close() throws IOException { doubleBuf = null; } + @Override + public void abort() throws IOException { + RPC.stopProxy(backupNode); + doubleBuf = null; + } + @Override // EditLogOutputStream void setReadyToFlush() throws IOException { doubleBuf.setReadyToFlush(); @@ -116,11 +123,21 @@ void setReadyToFlush() throws IOException { @Override // EditLogOutputStream protected void flushAndSync() throws IOException { - // XXX: this code won't work in trunk, but it's redone - // in HDFS-1073 where it's simpler. 
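  // The double-buffer contract is unchanged: callers buffer ops, call setReadyToFlush()
  // to swap buffers (new edits keep arriving in the other buffer), and flush() then
  // invokes flushAndSync() to ship the ready buffer. What changes here is the payload:
  // instead of an opaque record terminated by OP_INVALID, each flush carries an explicit
  // (firstTxId, numTxns, data) batch over JournalProtocol, so the BackupNode can detect
  // a missed transaction. Illustrative caller-side sequence (a sketch, assuming the
  // method names used elsewhere in this change):
  //
  //   stream.write(op);          // op buffered into the current buffer
  //   stream.setReadyToFlush();  // swap current <-> ready
  //   stream.flush();            // flushAndSync() -> backupNode.journal(reg, firstTxId, numTxns, data)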
+ assert out.getLength() == 0 : "Output buffer is not empty"; + + int numReadyTxns = doubleBuf.countReadyTxns(); + long firstTxToFlush = doubleBuf.getFirstReadyTxId(); + doubleBuf.flushTo(out); - if (out.size() > 0) { - send(NamenodeProtocol.JA_JOURNAL); + if (out.getLength() > 0) { + assert numReadyTxns > 0; + + byte[] data = Arrays.copyOf(out.getData(), out.getLength()); + out.reset(); + assert out.getLength() == 0 : "Output buffer is not empty"; + + backupNode.journal(nnRegistration, + firstTxToFlush, numReadyTxns, data); } } @@ -134,16 +151,6 @@ long length() throws IOException { return 0; } - private void send(int ja) throws IOException { - try { - int length = out.getLength(); - out.write(FSEditLogOpCodes.OP_INVALID.getOpCode()); - backupNode.journal(nnRegistration, ja, length, out.getData()); - } finally { - out.reset(); - } - } - /** * Get backup node registration. */ @@ -151,17 +158,7 @@ NamenodeRegistration getRegistration() { return bnRegistration; } - /** - * Verify that the backup node is alive. - */ - boolean isAlive() { - try { - send(NamenodeProtocol.JA_IS_ALIVE); - } catch(IOException ei) { - Storage.LOG.info(bnRegistration.getRole() + " " - + bnRegistration.getAddress() + " is not alive. ", ei); - return false; - } - return true; + void startLogSegment(long txId) throws IOException { + backupNode.startLogSegment(nnRegistration, txId); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java index b27b8cfbc4e..2ae0bd038f9 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java @@ -70,4 +70,10 @@ long length() throws IOException { // file size + size of both buffers return file.length(); } + + @Override + public String toString() { + return getName(); + } + } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java index 44130015ad9..f79f44266e0 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java @@ -25,6 +25,8 @@ import java.nio.ByteBuffer; import java.nio.channels.FileChannel; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.io.IOUtils; @@ -35,6 +37,8 @@ * stores edits in a local file. */ class EditLogFileOutputStream extends EditLogOutputStream { + private static Log LOG = LogFactory.getLog(EditLogFileOutputStream.class);; + private static int EDITS_FILE_HEADER_SIZE_BYTES = Integer.SIZE / Byte.SIZE; private File file; @@ -85,7 +89,14 @@ void write(FSEditLogOp op) throws IOException { doubleBuf.writeOp(op); } - /** {@inheritDoc} */ + /** + * Write a transaction to the stream. The serialization format is: + *
+   * <ul>
+   *   <li>the opcode (byte)</li>
+   *   <li>the transaction id (long)</li>
+   *   <li>the actual Writables for the transaction</li>
+   * </ul>
    + * */ @Override void writeRaw(byte[] bytes, int offset, int length) throws IOException { doubleBuf.writeRaw(bytes, offset, length); @@ -105,6 +116,10 @@ void create() throws IOException { @Override public void close() throws IOException { + if (fp == null) { + throw new IOException("Trying to use aborted output stream"); + } + try { // close should have been called after all pending transactions // have been flushed & synced. @@ -130,6 +145,16 @@ public void close() throws IOException { fc = null; fp = null; } + fp = null; + } + + @Override + public void abort() throws IOException { + if (fp == null) { + return; + } + IOUtils.cleanup(LOG, fp); + fp = null; } /** @@ -148,6 +173,10 @@ void setReadyToFlush() throws IOException { */ @Override protected void flushAndSync() throws IOException { + if (fp == null) { + throw new IOException("Trying to use aborted output stream"); + } + preallocate(); // preallocate file if necessary doubleBuf.flushTo(fp); fc.force(false); // metadata updates not needed because of preallocation @@ -189,21 +218,19 @@ private void preallocate() throws IOException { } } - /** - * Operations like OP_JSPOOL_START and OP_CHECKPOINT_TIME should not be - * written into edits file. - */ - @Override - boolean isOperationSupported(byte op) { - return op < FSEditLogOpCodes.OP_JSPOOL_START.getOpCode() - 1; - } - /** * Returns the file associated with this stream. */ File getFile() { return file; } + + /** + * @return true if this stream is currently open. + */ + public boolean isOpen() { + return fp != null; + } @VisibleForTesting public void setFileChannelForTesting(FileChannel fc) { diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java index eb16d692928..8577db8e457 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java @@ -18,9 +18,13 @@ package org.apache.hadoop.hdfs.server.namenode; import java.io.IOException; +import java.util.zip.Checksum; import static org.apache.hadoop.hdfs.server.common.Util.now; +import org.apache.hadoop.io.DataOutputBuffer; +import org.apache.hadoop.io.Writable; + /** * A generic abstract class to support journaling of edits logs into * a persistent storage. @@ -63,9 +67,19 @@ abstract void writeRaw(byte[] bytes, int offset, int length) */ abstract void create() throws IOException; - /** {@inheritDoc} */ + /** + * Close the journal. + * @throws IOException if the journal can't be closed, + * or if there are unflushed edits + */ abstract public void close() throws IOException; + /** + * Close the stream without necessarily flushing any pending data. + * This may be called after a previous write or close threw an exception. + */ + abstract public void abort() throws IOException; + /** * All data that has been written to the stream so far will be flushed. * New data can be still written to the stream while flushing is performed. 
@@ -108,10 +122,6 @@ public boolean shouldForceSync() { return false; } - boolean isOperationSupported(byte op) { - return true; - } - /** * Return total time spent in {@link #flushAndSync()} */ diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java index 60cb9e6dba7..0dd90588f44 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.io.OutputStream; +import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.Writer; import org.apache.hadoop.io.DataOutputBuffer; import org.apache.hadoop.io.IOUtils; @@ -35,20 +36,19 @@ */ class EditsDoubleBuffer { - private DataOutputBuffer bufCurrent; // current buffer for writing - private DataOutputBuffer bufReady; // buffer ready for flushing + private TxnBuffer bufCurrent; // current buffer for writing + private TxnBuffer bufReady; // buffer ready for flushing private final int initBufferSize; - private Writer writer; public EditsDoubleBuffer(int defaultBufferSize) { initBufferSize = defaultBufferSize; - bufCurrent = new DataOutputBuffer(initBufferSize); - bufReady = new DataOutputBuffer(initBufferSize); - writer = new FSEditLogOp.Writer(bufCurrent); + bufCurrent = new TxnBuffer(initBufferSize); + bufReady = new TxnBuffer(initBufferSize); + } public void writeOp(FSEditLogOp op) throws IOException { - writer.writeOp(op); + bufCurrent.writeOp(op); } void writeRaw(byte[] bytes, int offset, int length) throws IOException { @@ -71,10 +71,9 @@ void close() throws IOException { void setReadyToFlush() { assert isFlushed() : "previous data not flushed yet"; - DataOutputBuffer tmp = bufReady; + TxnBuffer tmp = bufReady; bufReady = bufCurrent; bufCurrent = tmp; - writer = new FSEditLogOp.Writer(bufCurrent); } /** @@ -102,4 +101,50 @@ public int countBufferedBytes() { return bufReady.size() + bufCurrent.size(); } + /** + * @return the transaction ID of the first transaction ready to be flushed + */ + public long getFirstReadyTxId() { + assert bufReady.firstTxId > 0; + return bufReady.firstTxId; + } + + /** + * @return the number of transactions that are ready to be flushed + */ + public int countReadyTxns() { + return bufReady.numTxns; + } + + + private static class TxnBuffer extends DataOutputBuffer { + long firstTxId; + int numTxns; + private Writer writer; + + public TxnBuffer(int initBufferSize) { + super(initBufferSize); + writer = new FSEditLogOp.Writer(this); + reset(); + } + + public void writeOp(FSEditLogOp op) throws IOException { + if (firstTxId == FSConstants.INVALID_TXID) { + firstTxId = op.txid; + } else { + assert op.txid > firstTxId; + } + writer.writeOp(op); + numTxns++; + } + + @Override + public DataOutputBuffer reset() { + super.reset(); + firstTxId = FSConstants.INVALID_TXID; + numTxns = 0; + return this; + } + } + } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java index 6ac35efa103..7c59bdf5722 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java @@ -22,9 +22,7 @@ import java.io.Closeable; import java.io.FileNotFoundException; import java.io.IOException; -import java.net.URI; import java.util.ArrayList; 
-import java.util.Collection; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Condition; @@ -157,26 +155,33 @@ private BlockManager getBlockManager() { return getFSNamesystem().getBlockManager(); } - void loadFSImage(Collection dataDirs, - Collection editsDirs, - StartupOption startOpt) + /** + * Load the filesystem image into memory. + * + * @param startOpt Startup type as specified by the user. + * @throws IOException If image or editlog cannot be read. + */ + void loadFSImage(StartupOption startOpt) throws IOException { // format before starting up if requested if (startOpt == StartupOption.FORMAT) { - fsImage.getStorage().setStorageDirectories(dataDirs, editsDirs); - fsImage.getStorage().format(fsImage.getStorage().determineClusterId()); // reuse current id + fsImage.format(fsImage.getStorage().determineClusterId());// reuse current id + startOpt = StartupOption.REGULAR; } + boolean success = false; try { - if (fsImage.recoverTransitionRead(dataDirs, editsDirs, startOpt)) { - fsImage.saveNamespace(true); + if (fsImage.recoverTransitionRead(startOpt)) { + fsImage.saveNamespace(); } - FSEditLog editLog = fsImage.getEditLog(); - assert editLog != null : "editLog must be initialized"; + fsImage.openEditLog(); + fsImage.setCheckpointDirectories(null, null); - } catch(IOException e) { - fsImage.close(); - throw e; + success = true; + } finally { + if (!success) { + fsImage.close(); + } } writeLock(); try { diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java index 0321d20b87b..bf137ae9c62 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java @@ -17,10 +17,7 @@ */ package org.apache.hadoop.hdfs.server.namenode; -import java.io.File; import java.io.IOException; -import java.io.DataOutputStream; -import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.zip.Checksum; @@ -33,22 +30,26 @@ import org.apache.hadoop.fs.Options; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hdfs.protocol.Block; +import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier; -import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import static org.apache.hadoop.hdfs.server.common.Util.now; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NNStorageListener; -import org.apache.hadoop.hdfs.server.namenode.JournalStream.JournalType; +import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger; import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; +import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Writable; import org.apache.hadoop.security.token.delegation.DelegationKey; import org.apache.hadoop.util.PureJavaCrc32; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; 
+import com.google.common.collect.Lists; + import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.*; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.*; @@ -58,23 +59,42 @@ */ @InterfaceAudience.Private @InterfaceStability.Evolving -public class FSEditLog implements NNStorageListener { +public class FSEditLog { static final String NO_JOURNAL_STREAMS_WARNING = "!!! WARNING !!!" + " File system changes are not persistent. No journal streams."; - private static final Log LOG = LogFactory.getLog(FSEditLog.class); + static final Log LOG = LogFactory.getLog(FSEditLog.class); - private volatile int sizeOutputFlushBuffer = 512*1024; + /** + * State machine for edit log. + * The log starts in UNITIALIZED state upon construction. Once it's + * initialized, it is usually in IN_SEGMENT state, indicating that edits + * may be written. In the middle of a roll, or while saving the namespace, + * it briefly enters the BETWEEN_LOG_SEGMENTS state, indicating that the + * previous segment has been closed, but the new one has not yet been opened. + */ + private enum State { + UNINITIALIZED, + BETWEEN_LOG_SEGMENTS, + IN_SEGMENT, + CLOSED; + } + private State state = State.UNINITIALIZED; - private ArrayList editStreams = null; + private List journals = Lists.newArrayList(); + // a monotonically increasing counter that represents transactionIds. private long txid = 0; // stores the last synced transactionId. private long synctxid = 0; + // the first txid of the log that's currently open for writing. + // If this value is N, we are currently writing to edits_inprogress_N + private long curSegmentTxId = FSConstants.INVALID_TXID; + // the time of printing the statistics to the log file. private long lastPrintTime; @@ -83,6 +103,10 @@ public class FSEditLog implements NNStorageListener { // is an automatic sync scheduled? private volatile boolean isAutoSyncScheduled = false; + + // Used to exit in the event of a failure to sync to all journals. It's a + // member variable so it can be swapped out for testing. + private Runtime runtime = Runtime.getRuntime(); // these are statistics counters. private long numTransactions; // number of transactions @@ -122,226 +146,90 @@ protected synchronized TransactionId initialValue() { FSEditLog(NNStorage storage) { isSyncRunning = false; this.storage = storage; - this.storage.registerListener(this); metrics = NameNode.getNameNodeMetrics(); lastPrintTime = now(); } - private File getEditFile(StorageDirectory sd) { - return storage.getEditFile(sd); - } - - private File getEditNewFile(StorageDirectory sd) { - return storage.getEditNewFile(sd); - } - - private int getNumEditsDirs() { - return storage.getNumStorageDirs(NameNodeDirType.EDITS); - } - - synchronized int getNumEditStreams() { - return editStreams == null ? 0 : editStreams.size(); - } - /** - * Return the currently active edit streams. - * This should be used only by unit tests. + * Initialize the list of edit journals */ - ArrayList getEditStreams() { - return editStreams; + synchronized void initJournals() { + assert journals.isEmpty(); + Preconditions.checkState(state == State.UNINITIALIZED, + "Bad state: %s", state); + + for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.EDITS)) { + journals.add(new JournalAndStream(new FileJournalManager(sd))); + } + + if (journals.isEmpty()) { + LOG.error("No edits directories configured!"); + } + + state = State.BETWEEN_LOG_SEGMENTS; } - - boolean isOpen() { - return getNumEditStreams() > 0; - } - + /** - * Create empty edit log files. 
- * Initialize the output stream for logging. - * - * @throws IOException + * Initialize the output stream for logging, opening the first + * log segment. */ synchronized void open() throws IOException { - numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0; - if (editStreams == null) - editStreams = new ArrayList(); - - ArrayList al = null; - for (Iterator it - = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) { - StorageDirectory sd = it.next(); - File eFile = getEditFile(sd); - try { - addNewEditLogStream(eFile); - } catch (IOException e) { - LOG.warn("Unable to open edit log file " + eFile); - // Remove the directory from list of storage directories - if(al == null) al = new ArrayList(1); - al.add(sd); - } - } - - if (al != null) - storage.reportErrorsOnDirectories(al); - - // If there was an error in every storage dir, each one will have - // been removed from the list of storage directories. - if (storage.getNumStorageDirs(NameNodeDirType.EDITS) == 0) { - throw new IOException( - "Failed to initialize edits log in any storage directory."); - } + Preconditions.checkState(state == State.UNINITIALIZED); + initJournals(); + + startLogSegment(getLastWrittenTxId() + 1, true); + assert state == State.IN_SEGMENT : "Bad state: " + state; } - - synchronized void addNewEditLogStream(File eFile) throws IOException { - EditLogOutputStream eStream = new EditLogFileOutputStream(eFile, - sizeOutputFlushBuffer); - editStreams.add(eStream); - } - - synchronized void createEditLogFile(File name) throws IOException { - waitForSyncToFinish(); - - EditLogOutputStream eStream = new EditLogFileOutputStream(name, - sizeOutputFlushBuffer); - eStream.create(); - eStream.close(); + synchronized boolean isOpen() { + return state == State.IN_SEGMENT; } /** * Shutdown the file store. */ synchronized void close() { - waitForSyncToFinish(); - if (editStreams == null || editStreams.isEmpty()) { + if (state == State.CLOSED) { + LOG.warn("Closing log when already closed", new Exception()); return; } - printStatistics(true); - numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0; - - ArrayList errorStreams = null; - Iterator it = getOutputStreamIterator(null); - while(it.hasNext()) { - EditLogOutputStream eStream = it.next(); - try { - closeStream(eStream); - } catch (IOException e) { - LOG.warn("FSEditLog:close - failed to close stream " - + eStream.getName()); - if(errorStreams == null) - errorStreams = new ArrayList(1); - errorStreams.add(eStream); - } - } - disableAndReportErrorOnStreams(errorStreams); - editStreams.clear(); - } - - /** - * Close and remove edit log stream. - * @param index of the stream - */ - synchronized private void removeStream(int index) { - EditLogOutputStream eStream = editStreams.get(index); - try { - eStream.close(); - } catch (Exception e) {} - editStreams.remove(index); - } - - /** - * The specified streams have IO errors. Close and remove them. 
- */ - synchronized - void disableAndReportErrorOnStreams(List errorStreams) { - if (errorStreams == null || errorStreams.size() == 0) { - return; // nothing to do - } - ArrayList errorDirs = new ArrayList(); - for (EditLogOutputStream e : errorStreams) { - if (e.getType() == JournalType.FILE) { - errorDirs.add(getStorageDirectoryForStream(e)); - } else { - disableStream(e); - } + + if (state == State.IN_SEGMENT) { + assert !journals.isEmpty(); + waitForSyncToFinish(); + endCurrentLogSegment(true); } - try { - storage.reportErrorsOnDirectories(errorDirs); - } catch (IOException ioe) { - LOG.error("Problem erroring streams " + ioe); - } - } - - - /** - * get an editStream corresponding to a sd - * @param es - stream to remove - * @return the matching stream - */ - StorageDirectory getStorage(EditLogOutputStream es) { - String parentStorageDir = ((EditLogFileOutputStream)es).getFile() - .getParentFile().getParentFile().getAbsolutePath(); - - Iterator it = storage.dirIterator(); - while (it.hasNext()) { - StorageDirectory sd = it.next(); - LOG.info("comparing: " + parentStorageDir + " and " + sd.getRoot().getAbsolutePath()); - if (parentStorageDir.equals(sd.getRoot().getAbsolutePath())) - return sd; - } - return null; - } - - /** - * get an editStream corresponding to a sd - * @param sd - * @return the matching stream - */ - synchronized EditLogOutputStream getEditsStream(StorageDirectory sd) { - for (EditLogOutputStream es : editStreams) { - File parentStorageDir = ((EditLogFileOutputStream)es).getFile() - .getParentFile().getParentFile(); - if (parentStorageDir.getName().equals(sd.getRoot().getName())) - return es; - } - return null; - } - - /** - * check if edits.new log exists in the specified stoorage directory - */ - boolean existsNew(StorageDirectory sd) { - return getEditNewFile(sd).exists(); + state = State.CLOSED; } /** * Write an operation to the edit log. Do not sync to persistent * store yet. 
*/ - void logEdit(FSEditLogOp op) { + void logEdit(final FSEditLogOp op) { synchronized (this) { + assert state != State.CLOSED; + // wait if an automatic sync is scheduled waitIfAutoSyncScheduled(); - if(getNumEditStreams() == 0) + if (journals.isEmpty()) { throw new java.lang.IllegalStateException(NO_JOURNAL_STREAMS_WARNING); - ArrayList errorStreams = null; - long start = now(); - for(EditLogOutputStream eStream : editStreams) { - if(!eStream.isOperationSupported(op.opCode.getOpCode())) - continue; - try { - eStream.write(op); - } catch (IOException ie) { - LOG.error("logEdit: removing "+ eStream.getName(), ie); - if(errorStreams == null) - errorStreams = new ArrayList(1); - errorStreams.add(eStream); - } } - disableAndReportErrorOnStreams(errorStreams); - recordTransaction(start); + + long start = beginTransaction(); + op.setTransactionId(txid); + + mapJournalsAndReportErrors(new JournalClosure() { + @Override + public void apply(JournalAndStream jas) throws IOException { + if (!jas.isActive()) return; + jas.stream.write(op); + } + }, "logging edit"); + + endTransaction(start); // check if it is time to schedule an automatic sync if (!shouldForceSync()) { @@ -384,15 +272,18 @@ synchronized void doneWithAutoSyncScheduling() { * @return true if any of the edit stream says that it should sync */ private boolean shouldForceSync() { - for (EditLogOutputStream eStream : editStreams) { - if (eStream.shouldForceSync()) { + for (JournalAndStream jas : journals) { + if (!jas.isActive()) continue; + + if (jas.getCurrentStream().shouldForceSync()) { return true; } } return false; } - private void recordTransaction(long start) { + private long beginTransaction() { + assert Thread.holdsLock(this); // get a new transactionId txid++; @@ -401,7 +292,12 @@ private void recordTransaction(long start) { // TransactionId id = myTransactionId.get(); id.txid = txid; - + return now(); + } + + private void endTransaction(long start) { + assert Thread.holdsLock(this); + // update statistics long end = now(); numTransactions++; @@ -410,6 +306,35 @@ private void recordTransaction(long start) { metrics.addTransaction(end-start); } + /** + * Return the transaction ID of the last transaction written to the log. + */ + synchronized long getLastWrittenTxId() { + return txid; + } + + /** + * @return the first transaction ID in the current log segment + */ + synchronized long getCurSegmentTxId() { + Preconditions.checkState(state == State.IN_SEGMENT, + "Bad state: %s", state); + return curSegmentTxId; + } + + /** + * Set the transaction ID to use for the next transaction written. + */ + synchronized void setNextTxId(long nextTxId) { + Preconditions.checkArgument(synctxid <= txid && + nextTxId >= txid, + "May not decrease txid." + + " synctxid=%s txid=%s nextTxId=%s", + synctxid, txid, nextTxId); + + txid = nextTxId - 1; + } + /** * Blocks until all ongoing edits have been synced to disk. * This differs from logSync in that it waits for edits that have been @@ -457,12 +382,15 @@ void logSyncAll() throws IOException { * waitForSyncToFinish() before assuming they are running alone. */ public void logSync() { - ArrayList errorStreams = null; long syncStart = 0; // Fetch the transactionId of this thread. 
long mytxid = myTransactionId.get().txid; - ArrayList streams = new ArrayList(); + + List candidateJournals = + Lists.newArrayListWithCapacity(journals.size()); + List badJournals = Lists.newArrayList(); + boolean sync = false; try { synchronized (this) { @@ -493,20 +421,16 @@ public void logSync() { sync = true; // swap buffers - assert editStreams.size() > 0 : "no editlog streams"; - for(EditLogOutputStream eStream : editStreams) { + assert !journals.isEmpty() : "no editlog streams"; + + for (JournalAndStream jas : journals) { + if (!jas.isActive()) continue; try { - eStream.setReadyToFlush(); - streams.add(eStream); + jas.getCurrentStream().setReadyToFlush(); + candidateJournals.add(jas); } catch (IOException ie) { LOG.error("Unable to get ready to flush.", ie); - // - // remember the streams that encountered an error. - // - if (errorStreams == null) { - errorStreams = new ArrayList(1); - } - errorStreams.add(eStream); + badJournals.add(jas); } } } finally { @@ -517,29 +441,36 @@ public void logSync() { // do the sync long start = now(); - for (EditLogOutputStream eStream : streams) { + for (JournalAndStream jas : candidateJournals) { + if (!jas.isActive()) continue; try { - eStream.flush(); + jas.getCurrentStream().flush(); } catch (IOException ie) { LOG.error("Unable to sync edit log.", ie); // // remember the streams that encountered an error. // - if (errorStreams == null) { - errorStreams = new ArrayList(1); - } - errorStreams.add(eStream); + badJournals.add(jas); } } long elapsed = now() - start; - disableAndReportErrorOnStreams(errorStreams); + disableAndReportErrorOnJournals(badJournals); - if (metrics != null) // Metrics non-null only when used inside name node + if (metrics != null) { // Metrics non-null only when used inside name node metrics.addSync(elapsed); + } + } finally { // Prevent RuntimeException from blocking other log edit sync synchronized (this) { if (sync) { + if (badJournals.size() >= journals.size()) { + LOG.fatal("Could not sync any journal to persistent storage. 
" + + "Unsynced transactions: " + (txid - synctxid), + new Exception()); + runtime.exit(1); + } + synctxid = syncStart; isSyncRunning = false; } @@ -556,7 +487,7 @@ private void printStatistics(boolean force) { if (lastPrintTime + 60000 > now && !force) { return; } - if (editStreams == null || editStreams.size()==0) { + if (journals.isEmpty()) { return; } lastPrintTime = now; @@ -568,12 +499,17 @@ private void printStatistics(boolean force) { buf.append("Number of transactions batched in Syncs: "); buf.append(numTransactionsBatchedInSync); buf.append(" Number of syncs: "); - buf.append(editStreams.get(0).getNumSync()); + for (JournalAndStream jas : journals) { + if (!jas.isActive()) continue; + buf.append(jas.getCurrentStream().getNumSync()); + break; + } + buf.append(" SyncTimes(ms): "); - int numEditStreams = editStreams.size(); - for (int idx = 0; idx < numEditStreams; idx++) { - EditLogOutputStream eStream = editStreams.get(idx); + for (JournalAndStream jas : journals) { + if (!jas.isActive()) continue; + EditLogOutputStream eStream = jas.getCurrentStream(); buf.append(eStream.getTotalSyncTime()); buf.append(" "); } @@ -788,112 +724,180 @@ void logReassignLease(String leaseHolder, String src, String newHolder) { } /** - * Return the size of the current EditLog + * @return the number of active (non-failed) journals */ - synchronized long getEditLogSize() throws IOException { - assert getNumEditsDirs() <= getNumEditStreams() : - "Number of edits directories should not exceed the number of streams."; - long size = 0; - ArrayList al = null; - for (int idx = 0; idx < getNumEditStreams(); idx++) { - EditLogOutputStream es = editStreams.get(idx); - try { - long curSize = es.length(); - assert (size == 0 || size == curSize || curSize ==0) : - "Wrong streams size"; - size = Math.max(size, curSize); - } catch (IOException e) { - LOG.error("getEditLogSize: editstream.length failed. removing editlog (" + - idx + ") " + es.getName()); - if(al==null) al = new ArrayList(1); - al.add(es); + private int countActiveJournals() { + int count = 0; + for (JournalAndStream jas : journals) { + if (jas.isActive()) { + count++; } } - if(al!=null) disableAndReportErrorOnStreams(al); - return size; + return count; } /** - * Closes the current edit log and opens edits.new. + * Used only by unit tests. */ - synchronized void rollEditLog() throws IOException { - waitForSyncToFinish(); - Iterator it = storage.dirIterator(NameNodeDirType.EDITS); - if(!it.hasNext()) - return; - // - // If edits.new already exists in some directory, verify it - // exists in all directories. - // - boolean alreadyExists = existsNew(it.next()); - while(it.hasNext()) { - StorageDirectory sd = it.next(); - if(alreadyExists != existsNew(sd)) - throw new IOException(getEditNewFile(sd) - + "should " + (alreadyExists ? "" : "not ") + "exist."); - } - if(alreadyExists) - return; // nothing to do, edits.new exists! + @VisibleForTesting + List getJournals() { + return journals; + } + + /** + * Used only by unit tests. 
+ */ + @VisibleForTesting + synchronized void setRuntimeForTesting(Runtime runtime) { + this.runtime = runtime; + } + + /** + * Return a manifest of what finalized edit logs are available + */ + public RemoteEditLogManifest getEditLogManifest(long sinceTxId) + throws IOException { + FSImageTransactionalStorageInspector inspector = + new FSImageTransactionalStorageInspector(); - // check if any of failed storage is now available and put it back + for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.EDITS)) { + inspector.inspectDirectory(sd); + } + + return inspector.getEditLogManifest(sinceTxId); + } + + /** + * Finalizes the current edit log and opens a new log segment. + * @return the transaction id of the BEGIN_LOG_SEGMENT transaction + * in the new log. + */ + synchronized long rollEditLog() throws IOException { + LOG.info("Rolling edit logs."); + endCurrentLogSegment(true); + + long nextTxId = getLastWrittenTxId() + 1; + startLogSegment(nextTxId, true); + + assert curSegmentTxId == nextTxId; + return nextTxId; + } + + /** + * Start writing to the log segment with the given txid. + * Transitions from BETWEEN_LOG_SEGMENTS state to IN_LOG_SEGMENT state. + */ + synchronized void startLogSegment(final long segmentTxId, + boolean writeHeaderTxn) throws IOException { + LOG.info("Starting log segment at " + segmentTxId); + Preconditions.checkArgument(segmentTxId > 0, + "Bad txid: %s", segmentTxId); + Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS, + "Bad state: %s", state); + Preconditions.checkState(segmentTxId > curSegmentTxId, + "Cannot start writing to log segment " + segmentTxId + + " when previous log segment started at " + curSegmentTxId); + Preconditions.checkArgument(segmentTxId == txid + 1, + "Cannot start log segment at txid %s when next expected " + + "txid is %s", segmentTxId, txid + 1); + + numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0; + + // TODO no need to link this back to storage anymore! + // See HDFS-2174. storage.attemptRestoreRemovedStorage(); - - divertFileStreams( - Storage.STORAGE_DIR_CURRENT + "/" + NameNodeFile.EDITS_NEW.getName()); - } - - /** - * Divert file streams from file edits to file edits.new.
    - * Close file streams, which are currently writing into edits files. - * Create new streams based on file getRoot()/dest. - * @param dest new stream path relative to the storage directory root. - * @throws IOException - */ - synchronized void divertFileStreams(String dest) throws IOException { - waitForSyncToFinish(); - - assert getNumEditStreams() >= getNumEditsDirs() : - "Inconsistent number of streams"; - ArrayList errorStreams = null; - EditStreamIterator itE = - (EditStreamIterator)getOutputStreamIterator(JournalType.FILE); - Iterator itD = - storage.dirIterator(NameNodeDirType.EDITS); - while(itE.hasNext() && itD.hasNext()) { - EditLogOutputStream eStream = itE.next(); - StorageDirectory sd = itD.next(); - if(!eStream.getName().startsWith(sd.getRoot().getPath())) - throw new IOException("Inconsistent order of edit streams: " + eStream); - try { - // close old stream - closeStream(eStream); - // create new stream - eStream = new EditLogFileOutputStream(new File(sd.getRoot(), dest), - sizeOutputFlushBuffer); - eStream.create(); - // replace by the new stream - itE.replace(eStream); - } catch (IOException e) { - LOG.warn("Error in editStream " + eStream.getName(), e); - if(errorStreams == null) - errorStreams = new ArrayList(1); - errorStreams.add(eStream); + + mapJournalsAndReportErrors(new JournalClosure() { + @Override + public void apply(JournalAndStream jas) throws IOException { + jas.startLogSegment(segmentTxId); } + }, "starting log segment " + segmentTxId); + + if (countActiveJournals() == 0) { + throw new IOException("Unable to start log segment " + + segmentTxId + ": no journals successfully started."); + } + + curSegmentTxId = segmentTxId; + state = State.IN_SEGMENT; + + if (writeHeaderTxn) { + logEdit(LogSegmentOp.getInstance( + FSEditLogOpCodes.OP_START_LOG_SEGMENT)); + logSync(); } - disableAndReportErrorOnStreams(errorStreams); } /** - * Removes the old edit log and renames edits.new to edits. - * Reopens the edits file. + * Finalize the current log segment. + * Transitions from IN_SEGMENT state to BETWEEN_LOG_SEGMENTS state. */ - synchronized void purgeEditLog() throws IOException { - waitForSyncToFinish(); - revertFileStreams( - Storage.STORAGE_DIR_CURRENT + "/" + NameNodeFile.EDITS_NEW.getName()); + synchronized void endCurrentLogSegment(boolean writeEndTxn) { + LOG.info("Ending log segment " + curSegmentTxId); + Preconditions.checkState(state == State.IN_SEGMENT, + "Bad state: %s", state); + + if (writeEndTxn) { + logEdit(LogSegmentOp.getInstance( + FSEditLogOpCodes.OP_END_LOG_SEGMENT)); + logSync(); + } + + printStatistics(true); + + final long lastTxId = getLastWrittenTxId(); + + mapJournalsAndReportErrors(new JournalClosure() { + @Override + public void apply(JournalAndStream jas) throws IOException { + if (jas.isActive()) { + jas.close(lastTxId); + } + } + }, "ending log segment"); + + state = State.BETWEEN_LOG_SEGMENTS; + } + + /** + * Abort all current logs. Called from the backup node. + */ + synchronized void abortCurrentLogSegment() { + mapJournalsAndReportErrors(new JournalClosure() { + + @Override + public void apply(JournalAndStream jas) throws IOException { + jas.abort(); + } + }, "aborting all streams"); + state = State.BETWEEN_LOG_SEGMENTS; } + /** + * Archive any log files that are older than the given txid. + */ + public void purgeLogsOlderThan( + final long minTxIdToKeep, final StoragePurger purger) { + synchronized (this) { + // synchronized to prevent findbugs warning about inconsistent + // synchronization. 
This will be JIT-ed out if asserts are + // off. + assert curSegmentTxId == FSConstants.INVALID_TXID || // on format this is no-op + minTxIdToKeep <= curSegmentTxId : + "cannot purge logs older than txid " + minTxIdToKeep + + " when current segment starts at " + curSegmentTxId; + } + + mapJournalsAndReportErrors(new JournalClosure() { + @Override + public void apply(JournalAndStream jas) throws IOException { + jas.manager.purgeLogsOlderThan(minTxIdToKeep, purger); + } + }, "purging logs older than " + minTxIdToKeep); + } + /** * The actual sync activity happens while not synchronized on this object. * Thus, synchronized activities that require that they are not concurrent @@ -907,85 +911,6 @@ synchronized void waitForSyncToFinish() { } } - /** - * Revert file streams from file edits.new back to file edits.
    - * Close file streams, which are currently writing into getRoot()/source. - * Rename getRoot()/source to edits. - * Reopen streams so that they start writing into edits files. - * @param dest new stream path relative to the storage directory root. - * @throws IOException - */ - synchronized void revertFileStreams(String source) throws IOException { - waitForSyncToFinish(); - - assert getNumEditStreams() >= getNumEditsDirs() : - "Inconsistent number of streams"; - ArrayList errorStreams = null; - EditStreamIterator itE = - (EditStreamIterator)getOutputStreamIterator(JournalType.FILE); - Iterator itD = - storage.dirIterator(NameNodeDirType.EDITS); - while(itE.hasNext() && itD.hasNext()) { - EditLogOutputStream eStream = itE.next(); - StorageDirectory sd = itD.next(); - if(!eStream.getName().startsWith(sd.getRoot().getPath())) - throw new IOException("Inconsistent order of edit streams: " + eStream + - " does not start with " + sd.getRoot().getPath()); - try { - // close old stream - closeStream(eStream); - // rename edits.new to edits - File editFile = getEditFile(sd); - File prevEditFile = new File(sd.getRoot(), source); - if(prevEditFile.exists()) { - if(!prevEditFile.renameTo(editFile)) { - // - // renameTo() fails on Windows if the destination - // file exists. - // - if(!editFile.delete() || !prevEditFile.renameTo(editFile)) { - throw new IOException("Rename failed for " + sd.getRoot()); - } - } - } - // open new stream - eStream = new EditLogFileOutputStream(editFile, sizeOutputFlushBuffer); - // replace by the new stream - itE.replace(eStream); - } catch (IOException e) { - LOG.warn("Error in editStream " + eStream.getName(), e); - if(errorStreams == null) - errorStreams = new ArrayList(1); - errorStreams.add(eStream); - } - } - disableAndReportErrorOnStreams(errorStreams); - } - - /** - * Return the name of the edit file - */ - synchronized File getFsEditName() { - StorageDirectory sd = null; - for (Iterator it = - storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) { - sd = it.next(); - if(sd.getRoot().canRead()) - return getEditFile(sd); - } - return null; - } - - /** - * Returns the timestamp of the edit log - */ - synchronized long getFsEditTime() { - Iterator it = storage.dirIterator(NameNodeDirType.EDITS); - if(it.hasNext()) - return getEditFile(it.next()).lastModified(); - return 0; - } - /** * Return the txid of the last synced transaction. * For test use only @@ -996,273 +921,224 @@ synchronized long getSyncTxId() { // sets the initial capacity of the flush buffer. - public void setBufferCapacity(int size) { - sizeOutputFlushBuffer = size; - } - - - boolean isEmpty() throws IOException { - return getEditLogSize() <= 0; + public void setOutputBufferCapacity(int size) { + for (JournalAndStream jas : journals) { + jas.manager.setOutputBufferCapacity(size); + } } /** * Create (or find if already exists) an edit output stream, which * streams journal records (edits) to the specified backup node.
    - * Send a record, prescribing to start journal spool.
    - * This should be sent via regular stream of journal records so that - * the backup node new exactly after which record it should start spooling. + * + * The new BackupNode will start receiving edits the next time this + * NameNode's logs roll. * * @param bnReg the backup node registration information. * @param nnReg this (active) name-node registration. * @throws IOException */ - synchronized void logJSpoolStart(NamenodeRegistration bnReg, // backup node - NamenodeRegistration nnReg) // active name-node + synchronized void registerBackupNode( + NamenodeRegistration bnReg, // backup node + NamenodeRegistration nnReg) // active name-node throws IOException { if(bnReg.isRole(NamenodeRole.CHECKPOINT)) return; // checkpoint node does not stream edits - if(editStreams == null) - editStreams = new ArrayList(); - EditLogOutputStream boStream = null; - for(EditLogOutputStream eStream : editStreams) { - if(eStream.getName().equals(bnReg.getAddress())) { - boStream = eStream; // already there - break; + + JournalAndStream jas = findBackupJournalAndStream(bnReg); + if (jas != null) { + // already registered + LOG.info("Backup node " + bnReg + " re-registers"); + return; + } + + LOG.info("Registering new backup node: " + bnReg); + BackupJournalManager bjm = new BackupJournalManager(bnReg, nnReg); + journals.add(new JournalAndStream(bjm)); + } + + synchronized void releaseBackupStream(NamenodeRegistration registration) { + for (Iterator iter = journals.iterator(); + iter.hasNext();) { + JournalAndStream jas = iter.next(); + if (jas.manager instanceof BackupJournalManager && + ((BackupJournalManager)jas.manager).matchesRegistration( + registration)) { + jas.abort(); + LOG.info("Removing backup journal " + jas); + iter.remove(); } } - if(boStream == null) { - boStream = new EditLogBackupOutputStream(bnReg, nnReg); - editStreams.add(boStream); + } + + /** + * Find the JournalAndStream associated with this BackupNode. + * @return null if it cannot be found + */ + private synchronized JournalAndStream findBackupJournalAndStream( + NamenodeRegistration bnReg) { + for (JournalAndStream jas : journals) { + if (jas.manager instanceof BackupJournalManager) { + BackupJournalManager bjm = (BackupJournalManager)jas.manager; + if (bjm.matchesRegistration(bnReg)) { + return jas; + } + } } - logEdit(JSpoolStartOp.getInstance()); + return null; } /** * Write an operation to the edit log. Do not sync to persistent * store yet. + */ + synchronized void logEdit(final int length, final byte[] data) { + long start = beginTransaction(); + + mapJournalsAndReportErrors(new JournalClosure() { + @Override + public void apply(JournalAndStream jas) throws IOException { + if (jas.isActive()) { + jas.getCurrentStream().writeRaw(data, 0, length); // TODO writeRaw + } + } + }, "Logging edit"); + + endTransaction(start); + } + + //// Iteration across journals + private interface JournalClosure { + public void apply(JournalAndStream jas) throws IOException; + } + + /** + * Apply the given function across all of the journal managers, disabling + * any for which the closure throws an IOException. + * @param status message used for logging errors (e.g. 
"opening journal") */ - synchronized void logEdit(int length, byte[] data) { - if(getNumEditStreams() == 0) - throw new java.lang.IllegalStateException(NO_JOURNAL_STREAMS_WARNING); - ArrayList errorStreams = null; - long start = now(); - for(EditLogOutputStream eStream : editStreams) { + private void mapJournalsAndReportErrors( + JournalClosure closure, String status) { + List badJAS = Lists.newLinkedList(); + for (JournalAndStream jas : journals) { try { - eStream.writeRaw(data, 0, length); - } catch (IOException ie) { - LOG.warn("Error in editStream " + eStream.getName(), ie); - if(errorStreams == null) - errorStreams = new ArrayList(1); - errorStreams.add(eStream); - } - } - disableAndReportErrorOnStreams(errorStreams); - recordTransaction(start); - } - - /** - * Iterates output streams based of the same type. - * Type null will iterate over all streams. - */ - private class EditStreamIterator implements Iterator { - JournalType type; - int prevIndex; // for remove() - int nextIndex; // for next() - - EditStreamIterator(JournalType streamType) { - this.type = streamType; - this.nextIndex = 0; - this.prevIndex = 0; - } - - public boolean hasNext() { - synchronized(FSEditLog.this) { - if(editStreams == null || - editStreams.isEmpty() || nextIndex >= editStreams.size()) - return false; - while(nextIndex < editStreams.size() - && !editStreams.get(nextIndex).getType().isOfType(type)) - nextIndex++; - return nextIndex < editStreams.size(); + closure.apply(jas); + } catch (Throwable t) { + LOG.error("Error " + status + " (journal " + jas + ")", t); + badJAS.add(jas); } } - public EditLogOutputStream next() { - EditLogOutputStream stream = null; - synchronized(FSEditLog.this) { - stream = editStreams.get(nextIndex); - prevIndex = nextIndex; - nextIndex++; - while(nextIndex < editStreams.size() - && !editStreams.get(nextIndex).getType().isOfType(type)) - nextIndex++; - } - return stream; - } - - public void remove() { - nextIndex = prevIndex; // restore previous state - removeStream(prevIndex); // remove last returned element - hasNext(); // reset nextIndex to correct place - } - - void replace(EditLogOutputStream newStream) { - synchronized (FSEditLog.this) { - assert 0 <= prevIndex && prevIndex < editStreams.size() : - "Index out of bound."; - editStreams.set(prevIndex, newStream); - } - } - } - - /** - * Get stream iterator for the specified type. 
- */ - public Iterator - getOutputStreamIterator(JournalType streamType) { - return new EditStreamIterator(streamType); - } - - private void closeStream(EditLogOutputStream eStream) throws IOException { - eStream.setReadyToFlush(); - eStream.flush(); - eStream.close(); - } - - void incrementCheckpointTime() { - storage.incrementCheckpointTime(); - CheckpointTimeOp op = CheckpointTimeOp.getInstance() - .setCheckpointTime(storage.getCheckpointTime()); - logEdit(op); - } - - synchronized void releaseBackupStream(NamenodeRegistration registration) { - Iterator it = - getOutputStreamIterator(JournalType.BACKUP); - ArrayList errorStreams = null; - NamenodeRegistration backupNode = null; - while(it.hasNext()) { - EditLogBackupOutputStream eStream = (EditLogBackupOutputStream)it.next(); - backupNode = eStream.getRegistration(); - if(backupNode.getAddress().equals(registration.getAddress()) && - backupNode.isRole(registration.getRole())) { - errorStreams = new ArrayList(1); - errorStreams.add(eStream); - break; - } - } - assert backupNode == null || backupNode.isRole(NamenodeRole.BACKUP) : - "Not a backup node corresponds to a backup stream"; - disableAndReportErrorOnStreams(errorStreams); - } - - synchronized boolean checkBackupRegistration( - NamenodeRegistration registration) { - Iterator it = - getOutputStreamIterator(JournalType.BACKUP); - boolean regAllowed = !it.hasNext(); - NamenodeRegistration backupNode = null; - ArrayList errorStreams = null; - while(it.hasNext()) { - EditLogBackupOutputStream eStream = (EditLogBackupOutputStream)it.next(); - backupNode = eStream.getRegistration(); - if(backupNode.getAddress().equals(registration.getAddress()) && - backupNode.isRole(registration.getRole())) { - regAllowed = true; // same node re-registers - break; - } - if(!eStream.isAlive()) { - if(errorStreams == null) - errorStreams = new ArrayList(1); - errorStreams.add(eStream); - regAllowed = true; // previous backup node failed - } - } - assert backupNode == null || backupNode.isRole(NamenodeRole.BACKUP) : - "Not a backup node corresponds to a backup stream"; - disableAndReportErrorOnStreams(errorStreams); - return regAllowed; + disableAndReportErrorOnJournals(badJAS); } - /** - * Get the StorageDirectory for a stream - * @param es Stream whose StorageDirectory we wish to know - * @return the matching StorageDirectory + * Called when some journals experience an error in some operation. + * This propagates errors to the storage level. */ - StorageDirectory getStorageDirectoryForStream(EditLogOutputStream es) { - String parentStorageDir = ((EditLogFileOutputStream)es).getFile().getParentFile().getParentFile().getAbsolutePath(); - - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - FSNamesystem.LOG.info("comparing: " + parentStorageDir - + " and " + sd.getRoot().getAbsolutePath()); - if (parentStorageDir.equals(sd.getRoot().getAbsolutePath())) - return sd; + private void disableAndReportErrorOnJournals(List badJournals) { + if (badJournals == null || badJournals.isEmpty()) { + return; // nothing to do } - return null; - } - - private synchronized void disableStream(EditLogOutputStream stream) { - try { stream.close(); } catch (IOException e) { - // nothing to do. 
- LOG.warn("Failed to close eStream " + stream.getName() - + " before removing it (might be ok)"); - } - editStreams.remove(stream); - - if (editStreams.size() <= 0) { - String msg = "Fatal Error: All storage directories are inaccessible."; - LOG.fatal(msg, new IOException(msg)); - Runtime.getRuntime().exit(-1); + + for (JournalAndStream j : badJournals) { + LOG.error("Disabling journal " + j); + j.abort(); } } /** - * Error Handling on a storageDirectory - * + * Container for a JournalManager paired with its currently + * active stream. + * + * If a Journal gets disabled due to an error writing to its + * stream, then the stream will be aborted and set to null. */ - // NNStorageListener Interface - @Override // NNStorageListener - public synchronized void errorOccurred(StorageDirectory sd) + static class JournalAndStream { + private final JournalManager manager; + private EditLogOutputStream stream; + private long segmentStartsAtTxId = FSConstants.INVALID_TXID; + + private JournalAndStream(JournalManager manager) { + this.manager = manager; + } + + private void startLogSegment(long txId) throws IOException { + Preconditions.checkState(stream == null); + stream = manager.startLogSegment(txId); + segmentStartsAtTxId = txId; + } + + private void close(long lastTxId) throws IOException { + Preconditions.checkArgument(lastTxId >= segmentStartsAtTxId, + "invalid segment: lastTxId %s >= " + + "segment starting txid %s", lastTxId, segmentStartsAtTxId); + + if (stream == null) return; + stream.close(); + manager.finalizeLogSegment(segmentStartsAtTxId, lastTxId); + stream = null; + } + + private void abort() { + if (stream == null) return; + try { + stream.abort(); + } catch (IOException ioe) { + LOG.error("Unable to abort stream " + stream, ioe); + } + stream = null; + segmentStartsAtTxId = FSConstants.INVALID_TXID; + } + + private boolean isActive() { + return stream != null; + } + + @VisibleForTesting + EditLogOutputStream getCurrentStream() { + return stream; + } + + @Override + public String toString() { + return "JournalAndStream(mgr=" + manager + + ", " + "stream=" + stream + ")"; + } + + @VisibleForTesting + void setCurrentStreamForTests(EditLogOutputStream stream) { + this.stream = stream; + } + + @VisibleForTesting + JournalManager getManager() { + return manager; + } + + private EditLogInputStream getInProgressInputStream() throws IOException { + return manager.getInProgressInputStream(segmentStartsAtTxId); + } + } + + /** + * @return an EditLogInputStream that reads from the same log that + * the edit log is currently writing. This is used from the BackupNode + * during edits synchronization. + * @throws IOException if no valid logs are available. 
+ */ + synchronized EditLogInputStream getInProgressFileInputStream() throws IOException { - if (editStreams == null) { - //errors can occur on storage directories - //before edit streams have been set up - return; - } - ArrayList errorStreams - = new ArrayList(); - - for (EditLogOutputStream eStream : editStreams) { - LOG.error("Unable to log edits to " + eStream.getName() - + "; removing it"); - - StorageDirectory streamStorageDir = getStorageDirectoryForStream(eStream); - if (sd == streamStorageDir) { - errorStreams.add(eStream); + for (JournalAndStream jas : journals) { + if (!jas.isActive()) continue; + try { + EditLogInputStream in = jas.getInProgressInputStream(); + if (in != null) return in; + } catch (IOException ioe) { + LOG.warn("Unable to get the in-progress input stream from " + jas, + ioe); } } - - for (EditLogOutputStream eStream : errorStreams) { - disableStream(eStream); - } - } - - @Override // NNStorageListener - public synchronized void formatOccurred(StorageDirectory sd) - throws IOException { - if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) { - createEditLogFile(NNStorage.getStorageFile(sd, NameNodeFile.EDITS)); - } - }; - - @Override // NNStorageListener - public synchronized void directoryAvailable(StorageDirectory sd) - throws IOException { - if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) { - File eFile = getEditFile(sd); - addNewEditLogStream(eFile); - } + throw new IOException("No in-progress stream provided edits"); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java index d8ecaf4e2ec..ebe19cfb08a 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java @@ -21,12 +21,12 @@ import java.io.BufferedInputStream; import java.io.DataInputStream; -import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; import java.io.FilterInputStream; import java.io.IOException; import java.io.InputStream; import java.util.Arrays; -import java.util.zip.CheckedInputStream; import java.util.zip.Checksum; import org.apache.hadoop.fs.permission.PermissionStatus; @@ -37,6 +37,8 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction; import org.apache.hadoop.hdfs.server.common.Storage; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.LogHeader; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.Reader; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp; import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp; @@ -71,75 +73,42 @@ public FSEditLogLoader(FSNamesystem fsNamesys) { * This is where we apply edits that we've been writing to disk all * along. 
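For reference, the journalling refactor above (mapJournalsAndReportErrors plus JournalAndStream) replaces the old per-stream iterators with a closure applied to every journal, so a failing edits directory is aborted and dropped without interrupting the healthy ones. A minimal, self-contained sketch of that pattern, using hypothetical Journal/JournalSet stand-ins rather than the actual Hadoop classes:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

// Hypothetical stand-ins for JournalManager / EditLogOutputStream; only the
// error-isolation pattern mirrors the patch above.
interface Journal {
  void write(byte[] record) throws IOException;
  void abort();
}

class JournalSet {
  interface JournalClosure {
    void apply(Journal j) throws IOException;
  }

  private final List<Journal> journals = new ArrayList<Journal>();

  void add(Journal j) { journals.add(j); }

  /** Apply the closure to every journal; disable only the ones that fail. */
  void mapAndReportErrors(JournalClosure closure, String status) {
    List<Journal> bad = new ArrayList<Journal>();
    for (Journal j : journals) {
      try {
        closure.apply(j);
      } catch (Throwable t) {
        System.err.println("Error " + status + " (journal " + j + "): " + t);
        bad.add(j);
      }
    }
    for (Journal j : bad) {
      j.abort();            // stop writing to the failed journal
      journals.remove(j);   // keep going with the healthy ones
    }
  }
}

A caller would then log an edit with something like journals.mapAndReportErrors(j -> j.write(record), "logging edit"), which is the same error-isolation behaviour FSEditLog now gets from mapJournalsAndReportErrors.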
*/ - int loadFSEdits(EditLogInputStream edits) throws IOException { + int loadFSEdits(EditLogInputStream edits, long expectedStartingTxId) + throws IOException { long startTime = now(); - int numEdits = loadFSEdits(edits, true); + int numEdits = loadFSEdits(edits, true, expectedStartingTxId); FSImage.LOG.info("Edits file " + edits.getName() + " of size " + edits.length() + " edits # " + numEdits + " loaded in " + (now()-startTime)/1000 + " seconds."); return numEdits; } - /** - * Read the header of fsedit log - * @param in fsedit stream - * @return the edit log version number - * @throws IOException if error occurs - */ - int readLogVersion(DataInputStream in) throws IOException { - int logVersion = 0; - // Read log file version. Could be missing. - in.mark(4); - // If edits log is greater than 2G, available method will return negative - // numbers, so we avoid having to call available - boolean available = true; - try { - logVersion = in.readByte(); - } catch (EOFException e) { - available = false; - } - if (available) { - in.reset(); - logVersion = in.readInt(); - if (logVersion < FSConstants.LAYOUT_VERSION) // future version - throw new IOException( - "Unexpected version of the file system log file: " - + logVersion + ". Current version = " - + FSConstants.LAYOUT_VERSION + "."); - } - assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION : - "Unsupported version " + logVersion; - return logVersion; - } - - int loadFSEdits(EditLogInputStream edits, boolean closeOnExit) throws IOException { + int loadFSEdits(EditLogInputStream edits, boolean closeOnExit, + long expectedStartingTxId) + throws IOException { BufferedInputStream bin = new BufferedInputStream(edits); DataInputStream in = new DataInputStream(bin); int numEdits = 0; - int logVersion = 0; try { - logVersion = readLogVersion(in); - Checksum checksum = null; - if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) { - checksum = FSEditLog.getChecksum(); - in = new DataInputStream(new CheckedInputStream(bin, checksum)); - } - - numEdits = loadEditRecords(logVersion, in, checksum, false); + LogHeader header = LogHeader.read(in); + numEdits = loadEditRecords( + header.logVersion, in, header.checksum, false, + expectedStartingTxId); } finally { if(closeOnExit) in.close(); } - if (logVersion != FSConstants.LAYOUT_VERSION) // other version - numEdits++; // save this image asap + return numEdits; } @SuppressWarnings("deprecation") int loadEditRecords(int logVersion, DataInputStream in, - Checksum checksum, boolean closeOnExit) throws IOException { + Checksum checksum, boolean closeOnExit, + long expectedStartingTxId) + throws IOException { FSDirectory fsDir = fsNamesys.dir; int numEdits = 0; @@ -162,6 +131,8 @@ int loadEditRecords(int logVersion, DataInputStream in, Arrays.fill(recentOpcodeOffsets, -1); try { + long txId = expectedStartingTxId - 1; + try { FSEditLogOp.Reader reader = new FSEditLogOp.Reader(in, logVersion, checksum); @@ -169,6 +140,15 @@ int loadEditRecords(int logVersion, DataInputStream in, while ((op = reader.readOp()) != null) { recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] = tracker.getPos(); + if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) { + long thisTxId = op.txid; + if (thisTxId != txId + 1) { + throw new IOException("Expected transaction ID " + + (txId + 1) + " but got " + thisTxId); + } + txId = thisTxId; + } + numEdits++; switch (op.opCode) { case OP_ADD: @@ -417,6 +397,12 @@ int loadEditRecords(int logVersion, DataInputStream in, reassignLeaseOp.path, 
reassignLeaseOp.newHolder, pendingFile); break; } + case OP_START_LOG_SEGMENT: + case OP_END_LOG_SEGMENT: { + // no data in here currently. + numOpOther++; + break; + } case OP_DATANODE_ADD: case OP_DATANODE_REMOVE: numOpOther++; @@ -494,6 +480,61 @@ private void check203UpgradeFailure(int logVersion, IOException ex) } } + /** + * Return the number of valid transactions in the file. If the file is + * truncated during the header, returns a value indicating that there are + * 0 valid transactions. + * @throws IOException if the file cannot be read due to an IO error (eg + * if the log does not exist) + */ + static EditLogValidation validateEditLog(File f) throws IOException { + FileInputStream fis = new FileInputStream(f); + try { + PositionTrackingInputStream tracker = new PositionTrackingInputStream( + new BufferedInputStream(fis)); + DataInputStream dis = new DataInputStream(tracker); + LogHeader header; + try { + header = LogHeader.read(dis); + } catch (Throwable t) { + FSImage.LOG.debug("Unable to read header from " + f + + " -> no valid transactions in this file."); + return new EditLogValidation(0, 0); + } + + Reader reader = new FSEditLogOp.Reader(dis, header.logVersion, header.checksum); + long numValid = 0; + long lastPos = 0; + try { + while (true) { + lastPos = tracker.getPos(); + if (reader.readOp() == null) { + break; + } + numValid++; + } + } catch (Throwable t) { + // Catch Throwable and not just IOE, since bad edits may generate + // NumberFormatExceptions, AssertionErrors, OutOfMemoryErrors, etc. + FSImage.LOG.debug("Caught exception after reading " + numValid + + " ops from " + f + " while determining its valid length.", t); + } + return new EditLogValidation(lastPos, numValid); + } finally { + fis.close(); + } + } + + static class EditLogValidation { + long validLength; + long numTransactions; + + EditLogValidation(long validLength, long numTransactions) { + this.validLength = validLength; + this.numTransactions = numTransactions; + } + } + /** * Stream wrapper that keeps track of the current file position. 
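The loader changes above add two txid-related mechanisms: loadEditRecords verifies that stored transaction IDs increase by exactly one, and validateEditLog scans a file op by op to find how many transactions are recoverable and where the last good byte ends. A simplified sketch of both checks, with a hypothetical Op/OpReader standing in for FSEditLogOp and its Reader:

import java.io.IOException;

class EditLogChecks {
  /** Minimal stand-in for FSEditLogOp: just a transaction ID. */
  static class Op {
    final long txid;
    Op(long txid) { this.txid = txid; }
  }

  interface OpReader {
    Op readOp() throws IOException;  // returns null at end of stream
    long position();                 // bytes consumed so far
  }

  /** Enforce contiguous txids while applying ops, as loadEditRecords now does. */
  static long applyContiguous(OpReader reader, long expectedStartingTxId)
      throws IOException {
    long txId = expectedStartingTxId - 1;
    long applied = 0;
    Op op;
    while ((op = reader.readOp()) != null) {
      if (op.txid != txId + 1) {
        throw new IOException("Expected transaction ID " + (txId + 1)
            + " but got " + op.txid);
      }
      txId = op.txid;
      applied++;
    }
    return applied;
  }

  /** Count valid ops and remember the last good offset, like validateEditLog. */
  static long[] validate(OpReader reader) {
    long numValid = 0;
    long lastPos = 0;
    try {
      while (reader.readOp() != null) {
        numValid++;
        lastPos = reader.position();
      }
    } catch (Throwable t) {
      // A torn or corrupt tail is normal for an in-progress log; everything
      // up to lastPos is still usable.
    }
    return new long[] { lastPos, numValid };  // {validLength, numTransactions}
  }
}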
*/ diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java index fb6e65f23c2..00ce353b9ac 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hdfs.server.namenode; +import java.util.zip.CheckedInputStream; import java.util.zip.Checksum; import java.util.EnumMap; @@ -29,10 +30,12 @@ import org.apache.hadoop.fs.permission.PermissionStatus; import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.DatanodeID; +import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.protocol.LayoutVersion; import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; -import org.apache.hadoop.hdfs.server.common.GenerationStamp; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor; +import org.apache.hadoop.hdfs.server.common.GenerationStamp; +import org.apache.hadoop.hdfs.server.common.Storage; import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.*; import org.apache.hadoop.security.token.delegation.DelegationKey; @@ -61,6 +64,8 @@ @InterfaceStability.Unstable public abstract class FSEditLogOp { final FSEditLogOpCodes opCode; + long txid; + @SuppressWarnings("deprecation") private static ThreadLocal> opInstances = @@ -93,8 +98,10 @@ protected EnumMap initialValue() { instances.put(OP_CANCEL_DELEGATION_TOKEN, new CancelDelegationTokenOp()); instances.put(OP_UPDATE_MASTER_KEY, new UpdateMasterKeyOp()); - instances.put(OP_CHECKPOINT_TIME, new CheckpointTimeOp()); - instances.put(OP_JSPOOL_START, new JSpoolStartOp()); + instances.put(OP_START_LOG_SEGMENT, + new LogSegmentOp(OP_START_LOG_SEGMENT)); + instances.put(OP_END_LOG_SEGMENT, + new LogSegmentOp(OP_END_LOG_SEGMENT)); return instances; } }; @@ -105,6 +112,11 @@ protected EnumMap initialValue() { */ private FSEditLogOp(FSEditLogOpCodes opCode) { this.opCode = opCode; + this.txid = 0; + } + + public void setTransactionId(long txid) { + this.txid = txid; } abstract void readFields(DataInputStream in, int logVersion) @@ -1207,6 +1219,28 @@ void readFields(DataInputStream in, int logVersion) this.key.readFields(in); } } + + static class LogSegmentOp extends FSEditLogOp { + private LogSegmentOp(FSEditLogOpCodes code) { + super(code); + assert code == OP_START_LOG_SEGMENT || + code == OP_END_LOG_SEGMENT : "Bad op: " + code; + } + + static LogSegmentOp getInstance(FSEditLogOpCodes code) { + return (LogSegmentOp)opInstances.get().get(code); + } + + public void readFields(DataInputStream in, int logVersion) + throws IOException { + // no data stored in these ops yet + } + + @Override + void writeFields(DataOutputStream out) throws IOException { + // no data stored + } + } static class InvalidOp extends FSEditLogOp { private InvalidOp() { @@ -1228,54 +1262,6 @@ void readFields(DataInputStream in, int logVersion) } } - static class JSpoolStartOp extends FSEditLogOp { - private JSpoolStartOp() { - super(OP_JSPOOL_START); - } - - static JSpoolStartOp getInstance() { - return (JSpoolStartOp)opInstances.get().get(OP_JSPOOL_START); - } - - @Override - void writeFields(DataOutputStream out) throws IOException { - } - - @Override - void readFields(DataInputStream in, int logVersion) - throws IOException { - } - } - - static class CheckpointTimeOp extends FSEditLogOp { - long checkpointTime; - - private CheckpointTimeOp() { - 
super(OP_CHECKPOINT_TIME); - } - - CheckpointTimeOp setCheckpointTime(long time) { - this.checkpointTime = time; - return this; - } - - static CheckpointTimeOp getInstance() { - return (CheckpointTimeOp)opInstances.get() - .get(OP_CHECKPOINT_TIME); - } - - @Override - void writeFields(DataOutputStream out) throws IOException { - new LongWritable(checkpointTime).write(out); - } - - @Override - void readFields(DataInputStream in, int logVersion) - throws IOException { - this.checkpointTime = readLong(in); - } - } - static private short readShort(DataInputStream in) throws IOException { return Short.parseShort(FSImageSerialization.readString(in)); } @@ -1337,6 +1323,62 @@ private static long readLongWritable(DataInputStream in) throws IOException { return longWritable.get(); } } + + /** + * Class to encapsulate the header at the top of a log file. + */ + static class LogHeader { + final int logVersion; + final Checksum checksum; + + public LogHeader(int logVersion, Checksum checksum) { + this.logVersion = logVersion; + this.checksum = checksum; + } + + static LogHeader read(DataInputStream in) throws IOException { + int logVersion = 0; + + logVersion = FSEditLogOp.LogHeader.readLogVersion(in); + Checksum checksum = null; + if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) { + checksum = FSEditLog.getChecksum(); + } + return new LogHeader(logVersion, checksum); + } + + /** + * Read the header of fsedit log + * @param in fsedit stream + * @return the edit log version number + * @throws IOException if error occurs + */ + private static int readLogVersion(DataInputStream in) throws IOException { + int logVersion = 0; + // Read log file version. Could be missing. + in.mark(4); + // If edits log is greater than 2G, available method will return negative + // numbers, so we avoid having to call available + boolean available = true; + try { + logVersion = in.readByte(); + } catch (EOFException e) { + available = false; + } + if (available) { + in.reset(); + logVersion = in.readInt(); + if (logVersion < FSConstants.LAYOUT_VERSION) // future version + throw new IOException( + "Unexpected version of the file system log file: " + + logVersion + ". 
Current version = " + + FSConstants.LAYOUT_VERSION + "."); + } + assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION : + "Unsupported version " + logVersion; + return logVersion; + } + } /** * Class for writing editlog ops @@ -1357,6 +1399,7 @@ public Writer(DataOutputBuffer out) { public void writeOp(FSEditLogOp op) throws IOException { int start = buf.getLength(); buf.writeByte(op.opCode.getOpCode()); + buf.writeLong(op.txid); op.writeFields(buf); int end = buf.getLength(); Checksum checksum = FSEditLog.getChecksum(); @@ -1384,7 +1427,12 @@ public static class Reader { @SuppressWarnings("deprecation") public Reader(DataInputStream in, int logVersion, Checksum checksum) { - this.in = in; + if (checksum != null) { + this.in = new DataInputStream( + new CheckedInputStream(in, checksum)); + } else { + this.in = in; + } this.logVersion = logVersion; this.checksum = checksum; } @@ -1423,9 +1471,15 @@ public FSEditLogOp readOp() throws IOException { if (op == null) { throw new IOException("Read invalid opcode " + opCode); } + + if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) { + // Read the txid + op.setTransactionId(in.readLong()); + } + op.readFields(in, logVersion); - validateChecksum(in, checksum); + validateChecksum(in, checksum, op.txid); return op; } @@ -1433,7 +1487,8 @@ public FSEditLogOp readOp() throws IOException { * Validate a transaction's checksum */ private void validateChecksum(DataInputStream in, - Checksum checksum) + Checksum checksum, + long txid) throws IOException { if (checksum != null) { int calculatedChecksum = (int)checksum.getValue(); @@ -1441,7 +1496,7 @@ private void validateChecksum(DataInputStream in, if (readChecksum != calculatedChecksum) { throw new ChecksumException( "Transaction is corrupt. Calculated checksum is " + - calculatedChecksum + " but read checksum " + readChecksum, -1); + calculatedChecksum + " but read checksum " + readChecksum, txid); } } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java index 3c59b67fe62..220c267f085 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java @@ -54,10 +54,8 @@ public enum FSEditLogOpCodes { OP_CANCEL_DELEGATION_TOKEN ((byte) 20), OP_UPDATE_MASTER_KEY ((byte) 21), OP_REASSIGN_LEASE ((byte) 22), - // must be same as NamenodeProtocol.JA_JSPOOL_START - OP_JSPOOL_START ((byte)102), - // must be same as NamenodeProtocol.JA_CHECKPOINT_TIME - OP_CHECKPOINT_TIME ((byte)103); + OP_END_LOG_SEGMENT ((byte) 23), + OP_START_LOG_SEGMENT ((byte) 24); private byte opCode; diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java index afa347c44f4..993dd8cd44e 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java @@ -21,17 +21,13 @@ import java.io.File; import java.io.IOException; import java.net.URI; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Date; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; -import java.util.Set; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -46,42 +42,45 @@ import 
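The Writer/Reader changes above alter the on-disk record framing: each record is now the opcode byte, an 8-byte transaction ID, the op body, and a trailing 4-byte checksum, and the Reader verifies the checksum before returning the op. A rough, self-contained sketch of that framing; it uses a plain java.util.zip.CRC32, whereas the patch obtains its Checksum from FSEditLog.getChecksum():

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.zip.CRC32;

class OpFraming {
  /** opcode | txid | body | checksum over the preceding bytes */
  static byte[] writeOp(byte opCode, long txid, byte[] body) throws IOException {
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bytes);
    out.writeByte(opCode);
    out.writeLong(txid);   // new in this patch: every record carries its txid
    out.write(body);
    CRC32 crc = new CRC32();
    crc.update(bytes.toByteArray());
    out.writeInt((int) crc.getValue());
    return bytes.toByteArray();
  }

  /** Read one record back and verify its checksum, as Reader.readOp() does. */
  static long readAndVerify(byte[] record) throws IOException {
    CRC32 crc = new CRC32();
    crc.update(record, 0, record.length - 4);  // everything except the checksum
    DataInputStream in = new DataInputStream(new ByteArrayInputStream(record));
    byte opCode = in.readByte();
    long txid = in.readLong();
    in.skipBytes(record.length - 1 - 8 - 4);   // skip the op body
    int readChecksum = in.readInt();
    if (readChecksum != (int) crc.getValue()) {
      throw new IOException("Transaction " + txid + " is corrupt (opcode "
          + opCode + ")");
    }
    return txid;
  }
}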
org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.common.Storage.StorageState; -import org.apache.hadoop.hdfs.server.common.StorageInfo; import org.apache.hadoop.hdfs.server.common.Util; import static org.apache.hadoop.hdfs.server.common.Util.now; import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption; +import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.LoadPlan; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NNStorageListener; import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; +import org.apache.hadoop.hdfs.util.MD5FileUtils; import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.hdfs.DFSConfigKeys; +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + /** * FSImage handles checkpointing and logging of the namespace edits. * */ @InterfaceAudience.Private @InterfaceStability.Evolving -public class FSImage implements NNStorageListener, Closeable { +public class FSImage implements Closeable { protected static final Log LOG = LogFactory.getLog(FSImage.class.getName()); - private static final SimpleDateFormat DATE_FORM = - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); - - // checkpoint states - enum CheckpointStates{START, ROLLED_EDITS, UPLOAD_START, UPLOAD_DONE; } - protected FSNamesystem namesystem = null; protected FSEditLog editLog = null; private boolean isUpgradeFinalized = false; - protected MD5Hash newImageDigest = null; - protected NNStorage storage = null; + protected NNStorage storage; + + /** + * The last transaction ID that was either loaded from an image + * or loaded by loading edits files. + */ + protected long lastAppliedTxId = 0; /** * URIs for importing an image from a checkpoint. In the default case, @@ -90,75 +89,69 @@ enum CheckpointStates{START, ROLLED_EDITS, UPLOAD_START, UPLOAD_DONE; } private Collection checkpointDirs; private Collection checkpointEditsDirs; - private Configuration conf; + final private Configuration conf; + + private final NNStorageRetentionManager archivalManager; /** - * Can fs-image be rolled? - */ - volatile protected CheckpointStates ckptState = FSImage.CheckpointStates.START; - - /** - */ - FSImage() { - this((FSNamesystem)null); - } - - /** - * Constructor + * Construct an FSImage. * @param conf Configuration + * @see #FSImage(Configuration conf, FSNamesystem ns, + * Collection imageDirs, Collection editsDirs) + * @throws IOException if default directories are invalid. 
*/ - FSImage(Configuration conf) throws IOException { - this(); - this.conf = conf; // TODO we have too many constructors, this is a mess - - if(conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY, - DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT)) { - NameNode.LOG.info("set FSImage.restoreFailedStorage"); - storage.setRestoreFailedStorage(true); - } - setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null), - FSImage.getCheckpointEditsDirs(conf, null)); + public FSImage(Configuration conf) throws IOException { + this(conf, (FSNamesystem)null); } - private FSImage(FSNamesystem ns) { - this.conf = new Configuration(); - - storage = new NNStorage(conf); + /** + * Construct an FSImage + * @param conf Configuration + * @param ns The FSNamesystem using this image. + * @see #FSImage(Configuration conf, FSNamesystem ns, + * Collection imageDirs, Collection editsDirs) + * @throws IOException if default directories are invalid. + */ + private FSImage(Configuration conf, FSNamesystem ns) throws IOException { + this(conf, ns, + FSNamesystem.getNamespaceDirs(conf), + FSNamesystem.getNamespaceEditsDirs(conf)); + } + + /** + * Construct the FSImage. Set the default checkpoint directories. + * + * Setup storage and initialize the edit log. + * + * @param conf Configuration + * @param ns The FSNamesystem using this image. + * @param imageDirs Directories the image can be stored in. + * @param editsDirs Directories the editlog can be stored in. + * @throws IOException if directories are invalid. + */ + protected FSImage(Configuration conf, FSNamesystem ns, + Collection imageDirs, Collection editsDirs) + throws IOException { + this.conf = conf; + setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null), + FSImage.getCheckpointEditsDirs(conf, null)); + + storage = new NNStorage(conf, imageDirs, editsDirs); if (ns != null) { storage.setUpgradeManager(ns.upgradeManager); } - storage.registerListener(this); + + if(conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY, + DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT)) { + storage.setRestoreFailedStorage(true); + } this.editLog = new FSEditLog(storage); setFSNamesystem(ns); + + archivalManager = new NNStorageRetentionManager(conf, storage, editLog); } - /** - * @throws IOException - */ - FSImage(Collection fsDirs, Collection fsEditsDirs) - throws IOException { - this(); - storage.setStorageDirectories(fsDirs, fsEditsDirs); - } - - public FSImage(StorageInfo storageInfo, String bpid) { - storage = new NNStorage(storageInfo, bpid); - } - - /** - * Represents an Image (image and edit file). - * @throws IOException - */ - FSImage(URI imageDir) throws IOException { - this(); - ArrayList dirs = new ArrayList(1); - ArrayList editsDirs = new ArrayList(1); - dirs.add(imageDir); - editsDirs.add(imageDir); - storage.setStorageDirectories(dirs, editsDirs); - } - protected FSNamesystem getFSNamesystem() { return namesystem; } @@ -169,33 +162,37 @@ void setFSNamesystem(FSNamesystem ns) { storage.setUpgradeManager(ns.upgradeManager); } } - + void setCheckpointDirectories(Collection dirs, Collection editsDirs) { checkpointDirs = dirs; checkpointEditsDirs = editsDirs; } + void format(String clusterId) throws IOException { + storage.format(clusterId); + saveFSImageInAllDirs(0); + } + /** * Analyze storage directories. * Recover from previous transitions if required. * Perform fs state transition if necessary depending on the namespace info. * Read storage info. 
* - * @param dataDirs - * @param startOpt startup option * @throws IOException * @return true if the image needs to be saved or false otherwise */ - boolean recoverTransitionRead(Collection dataDirs, - Collection editsDirs, - StartupOption startOpt) + boolean recoverTransitionRead(StartupOption startOpt) throws IOException { assert startOpt != StartupOption.FORMAT : "NameNode formatting should be performed before reading the image"; + Collection imageDirs = storage.getImageDirectories(); + Collection editsDirs = storage.getEditsDirectories(); + // none of the data dirs exist - if((dataDirs.size() == 0 || editsDirs.size() == 0) + if((imageDirs.size() == 0 || editsDirs.size() == 0) && startOpt != StartupOption.IMPORT) throw new IOException( "All specified directories are not accessible or do not exist."); @@ -210,50 +207,24 @@ boolean recoverTransitionRead(Collection dataDirs, throw new IOException("Cannot import image from a checkpoint. " + "\"dfs.namenode.checkpoint.dir\" is not set." ); - storage.setStorageDirectories(dataDirs, editsDirs); // 1. For each data directory calculate its state and // check whether all is consistent before transitioning. Map dataDirStates = new HashMap(); - boolean isFormatted = false; - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - StorageState curState; - try { - curState = sd.analyzeStorage(startOpt, storage); - // sd is locked but not opened - switch(curState) { - case NON_EXISTENT: - // name-node fails if any of the configured storage dirs are missing - throw new InconsistentFSStateException(sd.getRoot(), - "storage directory does not exist or is not accessible."); - case NOT_FORMATTED: - break; - case NORMAL: - break; - default: // recovery is possible - sd.doRecover(curState); - } - if (curState != StorageState.NOT_FORMATTED - && startOpt != StartupOption.ROLLBACK) { - // read and verify consistency with other directories - storage.readProperties(sd); - isFormatted = true; - } - if (startOpt == StartupOption.IMPORT && isFormatted) - // import of a checkpoint is allowed only into empty image directories - throw new IOException("Cannot import image from a checkpoint. " - + " NameNode already contains an image in "+ sd.getRoot()); - } catch (IOException ioe) { - sd.unlock(); - throw ioe; - } - dataDirStates.put(sd,curState); + boolean isFormatted = recoverStorageDirs(startOpt, dataDirStates); + + if (LOG.isTraceEnabled()) { + LOG.trace("Data dir states:\n " + + Joiner.on("\n ").withKeyValueSeparator(": ") + .join(dataDirStates)); } if (!isFormatted && startOpt != StartupOption.ROLLBACK - && startOpt != StartupOption.IMPORT) - throw new IOException("NameNode is not formatted."); + && startOpt != StartupOption.IMPORT) { + throw new IOException("NameNode is not formatted."); + } + + int layoutVersion = storage.getLayoutVersion(); if (layoutVersion < Storage.LAST_PRE_UPGRADE_LAYOUT_VERSION) { NNStorage.checkVersionUpgradable(storage.getLayoutVersion()); @@ -274,7 +245,6 @@ boolean recoverTransitionRead(Collection dataDirs, storage.verifyDistributedUpgradeProgress(startOpt); // 2. Format unformatted dirs. 
- storage.setCheckpointTime(0L); for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); StorageState curState = dataDirStates.get(sd); @@ -307,13 +277,55 @@ boolean recoverTransitionRead(Collection dataDirs, // just load the image } - boolean needToSave = loadFSImage(); - - assert editLog != null : "editLog must be initialized"; - if(!editLog.isOpen()) - editLog.open(); - - return needToSave; + return loadFSImage(); + } + + /** + * For each storage directory, performs recovery of incomplete transitions + * (eg. upgrade, rollback, checkpoint) and inserts the directory's storage + * state into the dataDirStates map. + * @param dataDirStates output of storage directory states + * @return true if there is at least one valid formatted storage directory + */ + private boolean recoverStorageDirs(StartupOption startOpt, + Map dataDirStates) throws IOException { + boolean isFormatted = false; + for (Iterator it = + storage.dirIterator(); it.hasNext();) { + StorageDirectory sd = it.next(); + StorageState curState; + try { + curState = sd.analyzeStorage(startOpt, storage); + // sd is locked but not opened + switch(curState) { + case NON_EXISTENT: + // name-node fails if any of the configured storage dirs are missing + throw new InconsistentFSStateException(sd.getRoot(), + "storage directory does not exist or is not accessible."); + case NOT_FORMATTED: + break; + case NORMAL: + break; + default: // recovery is possible + sd.doRecover(curState); + } + if (curState != StorageState.NOT_FORMATTED + && startOpt != StartupOption.ROLLBACK) { + // read and verify consistency with other directories + storage.readProperties(sd); + isFormatted = true; + } + if (startOpt == StartupOption.IMPORT && isFormatted) + // import of a checkpoint is allowed only into empty image directories + throw new IOException("Cannot import image from a checkpoint. 
" + + " NameNode already contains an image in " + sd.getRoot()); + } catch (IOException ioe) { + sd.unlock(); + throw ioe; + } + dataDirStates.put(sd,curState); + } + return isFormatted; } private void doUpgrade() throws IOException { @@ -342,12 +354,9 @@ private void doUpgrade() throws IOException { storage.cTime = now(); // generate new cTime for the state int oldLV = storage.getLayoutVersion(); storage.layoutVersion = FSConstants.LAYOUT_VERSION; - storage.setCheckpointTime(now()); List errorSDs = Collections.synchronizedList(new ArrayList()); - List saveThreads = new ArrayList(); - File curDir, prevDir, tmpDir; for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); LOG.info("Starting upgrade of image directory " + sd.getRoot() @@ -356,9 +365,9 @@ private void doUpgrade() throws IOException { + ".\n new LV = " + storage.getLayoutVersion() + "; new CTime = " + storage.getCTime()); try { - curDir = sd.getCurrentDir(); - prevDir = sd.getPreviousDir(); - tmpDir = sd.getPreviousTmp(); + File curDir = sd.getCurrentDir(); + File prevDir = sd.getPreviousDir(); + File tmpDir = sd.getPreviousTmp(); assert curDir.exists() : "Current directory must exist."; assert !prevDir.exists() : "prvious directory must not exist."; assert !tmpDir.exists() : "prvious.tmp directory must not exist."; @@ -367,27 +376,30 @@ private void doUpgrade() throws IOException { // rename current to tmp NNStorage.rename(curDir, tmpDir); - // launch thread to save new image - FSImageSaver saver = new FSImageSaver(sd, errorSDs); - Thread saveThread = new Thread(saver, saver.toString()); - saveThreads.add(saveThread); - saveThread.start(); - + if (!curDir.mkdir()) { + throw new IOException("Cannot create directory " + curDir); + } } catch (Exception e) { - LOG.error("Failed upgrade of image directory " + sd.getRoot(), e); + LOG.error("Failed to move aside pre-upgrade storage " + + "in image directory " + sd.getRoot(), e); errorSDs.add(sd); continue; } } - waitForThreads(saveThreads); - saveThreads.clear(); + storage.reportErrorsOnDirectories(errorSDs); + errorSDs.clear(); + + saveFSImageInAllDirs(editLog.getLastWrittenTxId()); for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); - if (errorSDs.contains(sd)) continue; try { - prevDir = sd.getPreviousDir(); - tmpDir = sd.getPreviousTmp(); + // Write the version file, since saveFsImage above only makes the + // fsimage_, and the directory is otherwise empty. + storage.writeProperties(sd); + + File prevDir = sd.getPreviousDir(); + File tmpDir = sd.getPreviousTmp(); // rename tmp to previous NNStorage.rename(tmpDir, prevDir); } catch (IOException ioe) { @@ -397,15 +409,16 @@ private void doUpgrade() throws IOException { } LOG.info("Upgrade of " + sd.getRoot() + " is complete."); } + storage.reportErrorsOnDirectories(errorSDs); + isUpgradeFinalized = false; - if (!errorSDs.isEmpty()) { - storage.reportErrorsOnDirectories(errorSDs); + if (!storage.getRemovedStorageDirs().isEmpty()) { //during upgrade, it's a fatal error to fail any storage directory - throw new IOException("Upgrade failed in " + errorSDs.size() + throw new IOException("Upgrade failed in " + + storage.getRemovedStorageDirs().size() + " storage directory(ies), previously logged."); } storage.initializeDistributedUpgrade(); - editLog.open(); } private void doRollback() throws IOException { @@ -413,7 +426,7 @@ private void doRollback() throws IOException { // a previous fs states in at least one of the storage directories. 
// Directories that don't have previous state do not rollback boolean canRollback = false; - FSImage prevState = new FSImage(getFSNamesystem()); + FSImage prevState = new FSImage(conf, getFSNamesystem()); prevState.getStorage().layoutVersion = FSConstants.LAYOUT_VERSION; for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); @@ -499,25 +512,27 @@ private void doFinalize(StorageDirectory sd) throws IOException { */ void doImportCheckpoint() throws IOException { FSNamesystem fsNamesys = getFSNamesystem(); - FSImage ckptImage = new FSImage(fsNamesys); + FSImage ckptImage = new FSImage(conf, fsNamesys, + checkpointDirs, checkpointEditsDirs); // replace real image with the checkpoint image FSImage realImage = fsNamesys.getFSImage(); assert realImage == this; fsNamesys.dir.fsImage = ckptImage; // load from the checkpoint dirs try { - ckptImage.recoverTransitionRead(checkpointDirs, checkpointEditsDirs, - StartupOption.REGULAR); + ckptImage.recoverTransitionRead(StartupOption.REGULAR); } finally { ckptImage.close(); } // return back the real image realImage.getStorage().setStorageInfo(ckptImage.getStorage()); - storage.setCheckpointTime(ckptImage.getStorage().getCheckpointTime()); + realImage.getEditLog().setNextTxId(ckptImage.getEditLog().getLastWrittenTxId()+1); + fsNamesys.dir.fsImage = realImage; realImage.getStorage().setBlockPoolID(ckptImage.getBlockPoolID()); // and save it but keep the same checkpointTime - saveNamespace(false); + saveNamespace(); + getStorage().writeAll(); } void finalizeUpgrade() throws IOException { @@ -535,51 +550,23 @@ public FSEditLog getEditLog() { return editLog; } - // - // Atomic move sequence, to recover from interrupted checkpoint - // - boolean recoverInterruptedCheckpoint(StorageDirectory nameSD, - StorageDirectory editsSD) - throws IOException { - boolean needToSave = false; - File curFile = NNStorage.getStorageFile(nameSD, NameNodeFile.IMAGE); - File ckptFile = NNStorage.getStorageFile(nameSD, NameNodeFile.IMAGE_NEW); + void openEditLog() throws IOException { + assert editLog != null : "editLog must be initialized"; + Preconditions.checkState(!editLog.isOpen(), + "edit log should not yet be open"); + editLog.open(); + storage.writeTransactionIdFileToStorage(editLog.getCurSegmentTxId()); + }; + + /** + * Toss the current image and namesystem, reloading from the specified + * file. + */ + void reloadFromImageFile(File file) throws IOException { + namesystem.dir.reset(); - // - // If we were in the midst of a checkpoint - // - if (ckptFile.exists()) { - needToSave = true; - if (NNStorage.getStorageFile(editsSD, NameNodeFile.EDITS_NEW).exists()) { - // - // checkpointing migth have uploaded a new - // merged image, but we discard it here because we are - // not sure whether the entire merged image was uploaded - // before the namenode crashed. - // - if (!ckptFile.delete()) { - throw new IOException("Unable to delete " + ckptFile); - } - } else { - // - // checkpointing was in progress when the namenode - // shutdown. The fsimage.ckpt was created and the edits.new - // file was moved to edits. We complete that checkpoint by - // moving fsimage.new to fsimage. There is no need to - // update the fstime file here. renameTo fails on Windows - // if the destination file already exists. 
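The reworked doUpgrade above follows a fixed directory dance so that a crash at any point leaves either the old or the new layout recoverable: move current aside to previous.tmp, recreate an empty current, save the new-format image into it, and only then publish previous. A condensed sketch of that ordering, with saveNewImage as a placeholder for saveFSImageInAllDirs and directory names taken from the patch:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class UpgradeLayout {
  /** Upgrade one storage directory: old state is always recoverable. */
  static void upgrade(Path storageRoot, Runnable saveNewImage) throws IOException {
    Path current = storageRoot.resolve("current");
    Path previousTmp = storageRoot.resolve("previous.tmp");
    Path previous = storageRoot.resolve("previous");

    Files.move(current, previousTmp);   // 1. current -> previous.tmp
    Files.createDirectory(current);     // 2. recreate an empty current/
    saveNewImage.run();                  // 3. write the new-format image into current/
    Files.move(previousTmp, previous);  // 4. previous.tmp -> previous: upgrade is
                                         //    complete and rollback stays possible
  }
}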
- // - if (!ckptFile.renameTo(curFile)) { - if (!curFile.delete()) - LOG.warn("Unable to delete dir " + curFile + " before rename"); - if (!ckptFile.renameTo(curFile)) { - throw new IOException("Unable to rename " + ckptFile + - " to " + curFile); - } - } - } - } - return needToSave; + LOG.debug("Reloading namespace from " + file); + loadFSImage(file); } /** @@ -598,124 +585,131 @@ boolean recoverInterruptedCheckpoint(StorageDirectory nameSD, * @throws IOException */ boolean loadFSImage() throws IOException { - long latestNameCheckpointTime = Long.MIN_VALUE; - long latestEditsCheckpointTime = Long.MIN_VALUE; - boolean needToSave = false; - isUpgradeFinalized = true; + FSImageStorageInspector inspector = storage.readAndInspectDirs(); - StorageDirectory latestNameSD = null; - StorageDirectory latestEditsSD = null; + isUpgradeFinalized = inspector.isUpgradeFinalized(); - Collection imageDirs = new ArrayList(); - Collection editsDirs = new ArrayList(); + boolean needToSave = inspector.needToSave(); - // Set to determine if all of storageDirectories share the same checkpoint - Set checkpointTimes = new HashSet(); + // Plan our load. This will throw if it's impossible to load from the + // data that's available. + LoadPlan loadPlan = inspector.createLoadPlan(); + LOG.debug("Planning to load image using following plan:\n" + loadPlan); - // Process each of the storage directories to find the pair of - // newest image file and edit file - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - - // Was the file just formatted? - if (!sd.getVersionFile().exists()) { - needToSave |= true; - continue; - } - - boolean imageExists = false; - boolean editsExists = false; - - // Determine if sd is image, edits or both - if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) { - imageExists = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE).exists(); - imageDirs.add(sd.getRoot().getCanonicalPath()); - } - - if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) { - editsExists = NNStorage.getStorageFile(sd, NameNodeFile.EDITS).exists(); - editsDirs.add(sd.getRoot().getCanonicalPath()); - } - - long checkpointTime = storage.readCheckpointTime(sd); - - checkpointTimes.add(checkpointTime); - - if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE) && - (latestNameCheckpointTime < checkpointTime) && imageExists) { - latestNameCheckpointTime = checkpointTime; - latestNameSD = sd; - } - - if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS) && - (latestEditsCheckpointTime < checkpointTime) && editsExists) { - latestEditsCheckpointTime = checkpointTime; - latestEditsSD = sd; - } - - // check that we have a valid, non-default checkpointTime - if (checkpointTime <= 0L) - needToSave |= true; - - // set finalized flag - isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists(); - } - - // We should have at least one image and one edits dirs - if (latestNameSD == null) - throw new IOException("Image file is not found in " + imageDirs); - if (latestEditsSD == null) - throw new IOException("Edits file is not found in " + editsDirs); - - // Make sure we are loading image and edits from same checkpoint - if (latestNameCheckpointTime > latestEditsCheckpointTime - && latestNameSD != latestEditsSD - && latestNameSD.getStorageDirType() == NameNodeDirType.IMAGE - && latestEditsSD.getStorageDirType() == NameNodeDirType.EDITS) { - // This is a rare failure when NN has image-only and edits-only - // storage directories, and fails right after saving images, 
- // in some of the storage directories, but before purging edits. - // See -NOTE- in saveNamespace(). - LOG.error("This is a rare failure scenario!!!"); - LOG.error("Image checkpoint time " + latestNameCheckpointTime + - " > edits checkpoint time " + latestEditsCheckpointTime); - LOG.error("Name-node will treat the image as the latest state of " + - "the namespace. Old edits will be discarded."); - } else if (latestNameCheckpointTime != latestEditsCheckpointTime) - throw new IOException("Inconsistent storage detected, " + - "image and edits checkpoint times do not match. " + - "image checkpoint time = " + latestNameCheckpointTime + - "edits checkpoint time = " + latestEditsCheckpointTime); - - // If there was more than one checkpointTime recorded we should save - needToSave |= checkpointTimes.size() != 1; // Recover from previous interrupted checkpoint, if any - needToSave |= recoverInterruptedCheckpoint(latestNameSD, latestEditsSD); + needToSave |= loadPlan.doRecovery(); // // Load in bits // - storage.readProperties(latestNameSD); - needToSave |= loadFSImage(NNStorage.getStorageFile(latestNameSD, - NameNodeFile.IMAGE)); + StorageDirectory sdForProperties = + loadPlan.getStorageDirectoryForProperties(); + storage.readProperties(sdForProperties); + File imageFile = loadPlan.getImageFile(); + + try { + if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, + getLayoutVersion())) { + // For txid-based layout, we should have a .md5 file + // next to the image file + loadFSImage(imageFile); + } else if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM, + getLayoutVersion())) { + // In 0.22, we have the checksum stored in the VERSION file. + String md5 = storage.getDeprecatedProperty( + NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY); + if (md5 == null) { + throw new InconsistentFSStateException(sdForProperties.getRoot(), + "Message digest property " + + NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY + + " not set for storage directory " + sdForProperties.getRoot()); + } + loadFSImage(imageFile, new MD5Hash(md5)); + } else { + // We don't have any record of the md5sum + loadFSImage(imageFile, null); + } + } catch (IOException ioe) { + throw new IOException("Failed to load image from " + loadPlan.getImageFile(), ioe); + } - // Load latest edits - if (latestNameCheckpointTime > latestEditsCheckpointTime) - // the image is already current, discard edits - needToSave |= true; - else // latestNameCheckpointTime == latestEditsCheckpointTime - needToSave |= (loadFSEdits(latestEditsSD) > 0); + long numLoaded = loadEdits(loadPlan.getEditsFiles()); + needToSave |= needsResaveBasedOnStaleCheckpoint(imageFile, numLoaded); + // update the txid for the edit log + editLog.setNextTxId(storage.getMostRecentCheckpointTxId() + numLoaded + 1); return needToSave; } + + /** + * @param imageFile the image file that was loaded + * @param numEditsLoaded the number of edits loaded from edits logs + * @return true if the NameNode should automatically save the namespace + * when it is started, due to the latest checkpoint being too old. 
+ */ + private boolean needsResaveBasedOnStaleCheckpoint( + File imageFile, long numEditsLoaded) { + final long checkpointPeriod = conf.getLong( + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); + final long checkpointTxnCount = conf.getLong( + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, + DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); + long checkpointAge = System.currentTimeMillis() - imageFile.lastModified(); + + return (checkpointAge > checkpointPeriod * 1000) || + (numEditsLoaded > checkpointTxnCount); + } + + /** + * Load the specified list of edit files into the image. + * @return the number of transactions loaded + */ + protected long loadEdits(List editLogs) throws IOException { + LOG.debug("About to load edits:\n " + Joiner.on("\n ").join(editLogs)); + + long startingTxId = getLastAppliedTxId() + 1; + + FSEditLogLoader loader = new FSEditLogLoader(namesystem); + int numLoaded = 0; + // Load latest edits + for (File edits : editLogs) { + LOG.debug("Reading " + edits + " expecting start txid #" + startingTxId); + EditLogFileInputStream editIn = new EditLogFileInputStream(edits); + int thisNumLoaded = loader.loadFSEdits(editIn, startingTxId); + startingTxId += thisNumLoaded; + numLoaded += thisNumLoaded; + lastAppliedTxId += thisNumLoaded; + editIn.close(); + } + + // update the counts + getFSNamesystem().dir.updateCountForINodeWithQuota(); + return numLoaded; + } + + + /** + * Load the image namespace from the given image file, verifying + * it against the MD5 sum stored in its associated .md5 file. + */ + private void loadFSImage(File imageFile) throws IOException { + MD5Hash expectedMD5 = MD5FileUtils.readStoredMd5ForFile(imageFile); + if (expectedMD5 == null) { + throw new IOException("No MD5 file found corresponding to image file " + + imageFile); + } + loadFSImage(imageFile, expectedMD5); + } + /** * Load in the filesystem image from file. It's a big list of * filenames and blocks. 
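loadEdits above replays a list of finalized edit files in order, threading the expected starting transaction ID through each file and accumulating the number of transactions applied. A compact sketch of that bookkeeping, with SegmentLoader standing in for FSEditLogLoader.loadFSEdits:

import java.io.File;
import java.io.IOException;
import java.util.List;

class EditReplay {
  interface SegmentLoader {
    /** Apply one edits file, returning the number of transactions applied. */
    int load(File edits, long expectedStartingTxId) throws IOException;
  }

  /** Replay segments in order; each segment must start where the last ended. */
  static long replay(List<File> editLogs, long lastAppliedTxId,
                     SegmentLoader loader) throws IOException {
    long startingTxId = lastAppliedTxId + 1;
    long numLoaded = 0;
    for (File edits : editLogs) {
      int thisNumLoaded = loader.load(edits, startingTxId);
      startingTxId += thisNumLoaded;
      numLoaded += thisNumLoaded;
    }
    return numLoaded;
  }
}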
Return whether we should * "re-save" and consolidate the edit-logs */ - boolean loadFSImage(File curFile) throws IOException { + private void loadFSImage(File curFile, MD5Hash expectedMd5) throws IOException { FSImageFormat.Loader loader = new FSImageFormat.Loader( conf, getFSNamesystem()); loader.load(curFile); @@ -724,63 +718,32 @@ boolean loadFSImage(File curFile) throws IOException { // Check that the image digest we loaded matches up with what // we expected MD5Hash readImageMd5 = loader.getLoadedImageMd5(); - if (storage.getImageDigest() == null) { - storage.setImageDigest(readImageMd5); // set this fsimage's checksum - } else if (!storage.getImageDigest().equals(readImageMd5)) { + if (expectedMd5 != null && + !expectedMd5.equals(readImageMd5)) { throw new IOException("Image file " + curFile + " is corrupt with MD5 checksum of " + readImageMd5 + - " but expecting " + storage.getImageDigest()); + " but expecting " + expectedMd5); } - storage.namespaceID = loader.getLoadedNamespaceID(); - storage.layoutVersion = loader.getLoadedImageVersion(); - - boolean needToSave = - loader.getLoadedImageVersion() != FSConstants.LAYOUT_VERSION; - return needToSave; - } - - /** - * Load and merge edits from two edits files - * - * @param sd storage directory - * @return number of edits loaded - * @throws IOException - */ - int loadFSEdits(StorageDirectory sd) throws IOException { - FSEditLogLoader loader = new FSEditLogLoader(namesystem); - - int numEdits = 0; - EditLogFileInputStream edits = - new EditLogFileInputStream(NNStorage.getStorageFile(sd, - NameNodeFile.EDITS)); - - numEdits = loader.loadFSEdits(edits); - edits.close(); - File editsNew = NNStorage.getStorageFile(sd, NameNodeFile.EDITS_NEW); - - if (editsNew.exists() && editsNew.length() > 0) { - edits = new EditLogFileInputStream(editsNew); - numEdits += loader.loadFSEdits(edits); - edits.close(); - } - if (numEdits == 0 && editsNew.exists()) { - numEdits++; - } - // update the counts. - getFSNamesystem().dir.updateCountForINodeWithQuota(); - - return numEdits; + long txId = loader.getLoadedImageTxId(); + LOG.info("Loaded image for txid " + txId + " from " + curFile); + lastAppliedTxId = txId; + storage.setMostRecentCheckpointTxId(txId); } /** * Save the contents of the FS image to the file. 
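With the txid-based layout, each saved image gets a sibling .md5 file: saveFSImage writes it via MD5FileUtils.saveMD5File and loadFSImage refuses to load an image whose recomputed digest does not match. A minimal sketch of that sidecar idea using java.security.MessageDigest; the exact sidecar format used by MD5FileUtils may differ, and reading the whole file into memory is only acceptable for an illustration:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

class Md5Sidecar {
  static String md5Hex(Path file) throws IOException, NoSuchAlgorithmException {
    MessageDigest md5 = MessageDigest.getInstance("MD5");
    byte[] digest = md5.digest(Files.readAllBytes(file));  // fine for a sketch only
    StringBuilder hex = new StringBuilder();
    for (byte b : digest) {
      hex.append(String.format("%02x", b));
    }
    return hex.toString();
  }

  /** Write <image>.md5 next to the image, as saveFSImage now does. */
  static void saveDigest(Path image) throws IOException, NoSuchAlgorithmException {
    Path sidecar = image.resolveSibling(image.getFileName() + ".md5");
    Files.write(sidecar, md5Hex(image).getBytes(StandardCharsets.UTF_8));
  }

  /** Verify the image against its sidecar before loading it. */
  static void verify(Path image) throws IOException, NoSuchAlgorithmException {
    Path sidecar = image.resolveSibling(image.getFileName() + ".md5");
    String expected = new String(Files.readAllBytes(sidecar),
        StandardCharsets.UTF_8).trim();
    String actual = md5Hex(image);
    if (!expected.equals(actual)) {
      throw new IOException("Image " + image + " is corrupt: expected MD5 "
          + expected + " but computed " + actual);
    }
  }
}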
*/ - void saveFSImage(File newFile) throws IOException { + void saveFSImage(StorageDirectory sd, long txid) throws IOException { + File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid); + File dstFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid); + FSImageFormat.Saver saver = new FSImageFormat.Saver(); FSImageCompression compression = FSImageCompression.createCompression(conf); - saver.save(newFile, getFSNamesystem(), compression); - storage.setImageDigest(saver.getSavedDigest()); + saver.save(newFile, txid, getFSNamesystem(), compression); + + MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest()); + storage.setMostRecentCheckpointTxId(txid); } /** @@ -797,15 +760,17 @@ void saveFSImage(File newFile) throws IOException { private class FSImageSaver implements Runnable { private StorageDirectory sd; private List errorSDs; + private final long txid; - FSImageSaver(StorageDirectory sd, List errorSDs) { + FSImageSaver(StorageDirectory sd, List errorSDs, long txid) { this.sd = sd; this.errorSDs = errorSDs; + this.txid = txid; } public void run() { try { - saveCurrent(sd); + saveFSImage(sd, txid); } catch (Throwable t) { LOG.error("Unable to save image for " + sd.getRoot(), t); errorSDs.add(sd); @@ -831,295 +796,125 @@ private void waitForThreads(List threads) { } } /** - * Save the contents of the FS image and create empty edits. - * - * In order to minimize the recovery effort in case of failure during - * saveNamespace the algorithm reduces discrepancy between directory states - * by performing updates in the following order: - *
- * <ol> - * <li> rename current to lastcheckpoint.tmp for all of them, - * <li> save image and recreate edits for all of them, - * <li> rename lastcheckpoint.tmp to previous.checkpoint. - * </ol>
    - * On stage (2) we first save all images, then recreate edits. - * Otherwise the name-node may purge all edits and fail, - * in which case the journal will be lost. + * Save the contents of the FS image to a new image file in each of the + * current storage directories. */ - void saveNamespace(boolean renewCheckpointTime) throws IOException { - - // try to restore all failed edit logs here + void saveNamespace() throws IOException { assert editLog != null : "editLog must be initialized"; storage.attemptRestoreRemovedStorage(); - editLog.close(); - if(renewCheckpointTime) - storage.setCheckpointTime(now()); - List errorSDs = - Collections.synchronizedList(new ArrayList()); - - // mv current -> lastcheckpoint.tmp - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - try { - storage.moveCurrent(sd); - } catch(IOException ie) { - LOG.error("Unable to move current for " + sd.getRoot(), ie); - errorSDs.add(sd); + boolean editLogWasOpen = editLog.isOpen(); + + if (editLogWasOpen) { + editLog.endCurrentLogSegment(true); + } + long imageTxId = editLog.getLastWrittenTxId(); + try { + saveFSImageInAllDirs(imageTxId); + storage.writeAll(); + } finally { + if (editLogWasOpen) { + editLog.startLogSegment(imageTxId + 1, true); + // Take this opportunity to note the current transaction + storage.writeTransactionIdFileToStorage(imageTxId + 1); } } + + } + + protected void saveFSImageInAllDirs(long txid) throws IOException { + if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) { + throw new IOException("No image directories available!"); + } + + List errorSDs = + Collections.synchronizedList(new ArrayList()); List saveThreads = new ArrayList(); // save images into current for (Iterator it = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) { StorageDirectory sd = it.next(); - if (errorSDs.contains(sd)) { - continue; - } - try { - FSImageSaver saver = new FSImageSaver(sd, errorSDs); - Thread saveThread = new Thread(saver, saver.toString()); - saveThreads.add(saveThread); - saveThread.start(); - } catch (Exception e) { - LOG.error("Failed save to image directory " + sd.getRoot(), e); - errorSDs.add(sd); - continue; - } + FSImageSaver saver = new FSImageSaver(sd, errorSDs, txid); + Thread saveThread = new Thread(saver, saver.toString()); + saveThreads.add(saveThread); + saveThread.start(); } waitForThreads(saveThreads); saveThreads.clear(); + storage.reportErrorsOnDirectories(errorSDs); - // -NOTE- - // If NN has image-only and edits-only storage directories and fails here - // the image will have the latest namespace state. - // During startup the image-only directories will recover by discarding - // lastcheckpoint.tmp, while - // the edits-only directories will recover by falling back - // to the old state contained in their lastcheckpoint.tmp. - // The edits directories should be discarded during startup because their - // checkpointTime is older than that of image directories. - // recreate edits in current - for (Iterator it - = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) { - StorageDirectory sd = it.next(); - if (errorSDs.contains(sd)) { - continue; - } - - // if this directory already stores the image and edits, then it was - // already processed in the earlier loop. 
- if (sd.getStorageDirType() == NameNodeDirType.IMAGE_AND_EDITS) { - continue; - } - - try { - FSImageSaver saver = new FSImageSaver(sd, errorSDs); - Thread saveThread = new Thread(saver, saver.toString()); - saveThreads.add(saveThread); - saveThread.start(); - } catch (Exception e) { - LOG.error("Failed save to edits directory " + sd.getRoot(), e); - errorSDs.add(sd); - continue; - } + if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) { + throw new IOException( + "Failed to save in any storage directories while saving namespace."); } - waitForThreads(saveThreads); - // mv lastcheckpoint.tmp -> previous.checkpoint - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - if (errorSDs.contains(sd)) { - continue; - } - try { - storage.moveLastCheckpoint(sd); - } catch(IOException ie) { - LOG.error("Unable to move last checkpoint for " + sd.getRoot(), ie); - errorSDs.add(sd); - continue; - } - } + renameCheckpoint(txid); + // Since we now have a new checkpoint, we can clean up some + // old edit logs and checkpoints. + purgeOldStorage(); + } + + /** + * Purge any files in the storage directories that are no longer + * necessary. + */ + public void purgeOldStorage() { try { - storage.reportErrorsOnDirectories(errorSDs); - - // If there was an error in every storage dir, each one will have been - // removed from the list of storage directories. - if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0 || - storage.getNumStorageDirs(NameNodeDirType.EDITS) == 0) { - throw new IOException("Failed to save any storage directories while saving namespace"); - } - - if(!editLog.isOpen()) editLog.open(); - } finally { - ckptState = CheckpointStates.UPLOAD_DONE; + archivalManager.purgeOldStorage(); + } catch (Exception e) { + LOG.warn("Unable to purge old storage", e); } } - /** - * Save current image and empty journal into {@code current} directory. - */ - protected void saveCurrent(StorageDirectory sd) throws IOException { - if (storage.getLayoutVersion() != FSConstants.LAYOUT_VERSION) { - throw new IllegalStateException( - "NN with storage version " + FSConstants.LAYOUT_VERSION + - "cannot save an image with version " + storage.getLayoutVersion()); - } - File curDir = sd.getCurrentDir(); - NameNodeDirType dirType = (NameNodeDirType)sd.getStorageDirType(); - // save new image or new edits - if (!curDir.exists() && !curDir.mkdir()) - throw new IOException("Cannot create directory " + curDir); - if (dirType.isOfType(NameNodeDirType.IMAGE)) - saveFSImage(NNStorage.getStorageFile(sd, NameNodeFile.IMAGE)); - if (dirType.isOfType(NameNodeDirType.EDITS)) - editLog.createEditLogFile(NNStorage.getStorageFile(sd, - NameNodeFile.EDITS)); - // write version and time files - storage.writeProperties(sd); - } - - - /** - * Moves fsimage.ckpt to fsImage and edits.new to edits - * Reopens the new edits file. 
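The new saveNamespace above brackets the checkpoint with log segments keyed by transaction ID: finalize the in-progress segment, save the image at the last written txid, then start a fresh segment at txid + 1 and record that txid in every storage directory. A schematic sketch of that flow, with Log/Storage as hypothetical interfaces abstracting FSEditLog and the storage layer:

import java.io.IOException;

class CheckpointFlow {
  interface Log {
    boolean isOpen();
    long getLastWrittenTxId();
    void endCurrentLogSegment() throws IOException;
    void startLogSegment(long txid) throws IOException;
  }

  interface Storage {
    void saveImage(long txid) throws IOException;           // image for txid in every image dir
    void recordTransactionId(long txid) throws IOException; // per-directory txid marker file
  }

  /** saveNamespace now brackets the checkpoint with txid-keyed log segments. */
  static void saveNamespace(Log log, Storage storage) throws IOException {
    boolean wasOpen = log.isOpen();
    if (wasOpen) {
      log.endCurrentLogSegment();            // finalize the in-progress edits segment
    }
    long imageTxId = log.getLastWrittenTxId();
    try {
      storage.saveImage(imageTxId);          // checkpoint covers all edits up to imageTxId
    } finally {
      if (wasOpen) {
        log.startLogSegment(imageTxId + 1);  // resume logging in a new segment
        storage.recordTransactionId(imageTxId + 1);
      }
    }
  }
}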
- */ - void rollFSImage(CheckpointSignature sig, - boolean renewCheckpointTime) throws IOException { - sig.validateStorageInfo(this); - rollFSImage(true); - } - - private void rollFSImage(boolean renewCheckpointTime) - throws IOException { - if (ckptState != CheckpointStates.UPLOAD_DONE - && !(ckptState == CheckpointStates.ROLLED_EDITS - && storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0)) { - throw new IOException("Cannot roll fsImage before rolling edits log."); - } - - for (Iterator it - = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) { - StorageDirectory sd = it.next(); - File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW); - if (!ckpt.exists()) { - throw new IOException("Checkpoint file " + ckpt + - " does not exist"); - } - } - editLog.purgeEditLog(); // renamed edits.new to edits - if(LOG.isDebugEnabled()) { - LOG.debug("rollFSImage after purgeEditLog: storageList=" - + storage.listStorageDirectories()); - } - // - // Renames new image - // - renameCheckpoint(); - resetVersion(renewCheckpointTime, newImageDigest); - } - /** * Renames new image */ - void renameCheckpoint() throws IOException { + private void renameCheckpoint(long txid) throws IOException { ArrayList al = null; - for (Iterator it - = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) { - StorageDirectory sd = it.next(); - File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW); - File curFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE); - // renameTo fails on Windows if the destination file - // already exists. - if(LOG.isDebugEnabled()) { - LOG.debug("renaming " + ckpt.getAbsolutePath() - + " to " + curFile.getAbsolutePath()); - } - if (!ckpt.renameTo(curFile)) { - if (!curFile.delete() || !ckpt.renameTo(curFile)) { - LOG.warn("renaming " + ckpt.getAbsolutePath() + " to " + - curFile.getAbsolutePath() + " FAILED"); - if(al == null) al = new ArrayList (1); - al.add(sd); + for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) { + try { + renameCheckpointInDir(sd, txid); + } catch (IOException ioe) { + LOG.warn("Unable to rename checkpoint in " + sd, ioe); + if (al == null) { + al = Lists.newArrayList(); } + al.add(sd); } } if(al != null) storage.reportErrorsOnDirectories(al); } - /** - * Updates version and fstime files in all directories (fsimage and edits). 
- */ - void resetVersion(boolean renewCheckpointTime, MD5Hash newImageDigest) + private void renameCheckpointInDir(StorageDirectory sd, long txid) throws IOException { - storage.layoutVersion = FSConstants.LAYOUT_VERSION; - if(renewCheckpointTime) - storage.setCheckpointTime(now()); - storage.setImageDigest(newImageDigest); - - ArrayList al = null; - for (Iterator it = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - // delete old edits if sd is the image only the directory - if (!sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) { - File editsFile = NNStorage.getStorageFile(sd, NameNodeFile.EDITS); - if(editsFile.exists() && !editsFile.delete()) - throw new IOException("Cannot delete edits file " - + editsFile.getCanonicalPath()); - } - // delete old fsimage if sd is the edits only the directory - if (!sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) { - File imageFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE); - if(imageFile.exists() && !imageFile.delete()) - throw new IOException("Cannot delete image file " - + imageFile.getCanonicalPath()); - } - try { - storage.writeProperties(sd); - } catch (IOException e) { - LOG.error("Cannot write file " + sd.getRoot(), e); - - if(al == null) al = new ArrayList (1); - al.add(sd); - } + File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid); + File curFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid); + // renameTo fails on Windows if the destination file + // already exists. + if(LOG.isDebugEnabled()) { + LOG.debug("renaming " + ckpt.getAbsolutePath() + + " to " + curFile.getAbsolutePath()); } - if(al != null) storage.reportErrorsOnDirectories(al); - ckptState = FSImage.CheckpointStates.START; + if (!ckpt.renameTo(curFile)) { + if (!curFile.delete() || !ckpt.renameTo(curFile)) { + throw new IOException("renaming " + ckpt.getAbsolutePath() + " to " + + curFile.getAbsolutePath() + " FAILED"); + } + } } CheckpointSignature rollEditLog() throws IOException { getEditLog().rollEditLog(); - ckptState = CheckpointStates.ROLLED_EDITS; - // If checkpoint fails this should be the most recent image, therefore - storage.incrementCheckpointTime(); + // Record this log segment ID in all of the storage directories, so + // we won't miss this log segment on a restart if the edits directories + // go missing. + storage.writeTransactionIdFileToStorage(getEditLog().getCurSegmentTxId()); return new CheckpointSignature(this); } - /** - * This is called just before a new checkpoint is uploaded to the - * namenode. - */ - void validateCheckpointUpload(CheckpointSignature sig) throws IOException { - if (ckptState != CheckpointStates.ROLLED_EDITS) { - throw new IOException("Namenode is not expecting an new image " + - ckptState); - } - // verify token - long modtime = getEditLog().getFsEditTime(); - if (sig.editsTime != modtime) { - throw new IOException("Namenode has an edit log with timestamp of " + - DATE_FORM.format(new Date(modtime)) + - " but new checkpoint was created using editlog " + - " with timestamp " + - DATE_FORM.format(new Date(sig.editsTime)) + - ". Checkpoint Aborted."); - } - sig.validateStorageInfo(this); - ckptState = FSImage.CheckpointStates.UPLOAD_START; - } - /** * Start checkpoint. *

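/*
 * Illustrative sketch (editor's addition, not part of the patch): the hunks above
 * replace the old "mv current -> lastcheckpoint.tmp" dance with a txid-based
 * sequence: close the open log segment, write fsimage_N for the last written
 * txid N into every image directory, then reopen a segment at N + 1 and record
 * that txid in the transaction-id file (read back as seen_txid by the new
 * storage inspector). The toy helper below only spells out the file names a
 * storage directory would hold afterwards, assuming the edit log was open; any
 * zero-padding of txids in the real NNStorage naming is ignored here.
 */
import java.util.Arrays;
import java.util.List;

class SaveNamespaceLayoutSketch {
  static List<String> filesAfterSaveNamespace(long lastWrittenTxId) {
    long imageTxId = lastWrittenTxId;
    return Arrays.asList(
        "fsimage_" + imageTxId,                 // checkpoint covering txids <= imageTxId
        "edits_inprogress_" + (imageTxId + 1),  // freshly opened log segment
        "seen_txid");                           // holds imageTxId + 1 after the save
  }

  public static void main(String[] args) {
    // e.g. [fsimage_100, edits_inprogress_101, seen_txid]
    System.out.println(filesAfterSaveNamespace(100));
  }
}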
    @@ -1150,34 +945,23 @@ else if(bnReg.isRole(NamenodeRole.NAMENODE)) + " role " + bnReg.getRole() + ": checkpoint is not allowed."; else if(bnReg.getLayoutVersion() < storage.getLayoutVersion() || (bnReg.getLayoutVersion() == storage.getLayoutVersion() - && bnReg.getCTime() > storage.getCTime()) - || (bnReg.getLayoutVersion() == storage.getLayoutVersion() - && bnReg.getCTime() == storage.getCTime() - && bnReg.getCheckpointTime() > storage.getCheckpointTime())) + && bnReg.getCTime() > storage.getCTime())) // remote node has newer image age msg = "Name node " + bnReg.getAddress() + " has newer image layout version: LV = " +bnReg.getLayoutVersion() + " cTime = " + bnReg.getCTime() - + " checkpointTime = " + bnReg.getCheckpointTime() + ". Current version: LV = " + storage.getLayoutVersion() - + " cTime = " + storage.getCTime() - + " checkpointTime = " + storage.getCheckpointTime(); + + " cTime = " + storage.getCTime(); if(msg != null) { LOG.error(msg); return new NamenodeCommand(NamenodeProtocol.ACT_SHUTDOWN); } - boolean isImgObsolete = true; - if(bnReg.getLayoutVersion() == storage.getLayoutVersion() - && bnReg.getCTime() == storage.getCTime() - && bnReg.getCheckpointTime() == storage.getCheckpointTime()) - isImgObsolete = false; boolean needToReturnImg = true; if(storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) // do not return image if there are no image directories needToReturnImg = false; CheckpointSignature sig = rollEditLog(); - getEditLog().logJSpoolStart(bnReg, nnReg); - return new CheckpointCommand(sig, isImgObsolete, needToReturnImg); + return new CheckpointCommand(sig, needToReturnImg); } /** @@ -1196,32 +980,41 @@ else if(bnReg.getLayoutVersion() < storage.getLayoutVersion() void endCheckpoint(CheckpointSignature sig, NamenodeRole remoteNNRole) throws IOException { sig.validateStorageInfo(this); - // Renew checkpoint time for the active if the other is a checkpoint-node. - // The checkpoint-node should have older image for the next checkpoint - // to take effect. - // The backup-node always has up-to-date image and will have the same - // checkpoint time as the active node. - boolean renewCheckpointTime = remoteNNRole.equals(NamenodeRole.CHECKPOINT); - rollFSImage(sig, renewCheckpointTime); - } - - CheckpointStates getCheckpointState() { - return ckptState; - } - - void setCheckpointState(CheckpointStates cs) { - ckptState = cs; } /** - * This is called when a checkpoint upload finishes successfully. + * This is called by the 2NN after having downloaded an image, and by + * the NN after having received a new image from the 2NN. It + * renames the image from fsimage_N.ckpt to fsimage_N and also + * saves the related .md5 file into place. 
*/ - synchronized void checkpointUploadDone() { - ckptState = CheckpointStates.UPLOAD_DONE; + synchronized void saveDigestAndRenameCheckpointImage( + long txid, MD5Hash digest) throws IOException { + renameCheckpoint(txid); + List badSds = Lists.newArrayList(); + + for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) { + File imageFile = NNStorage.getImageFile(sd, txid); + try { + MD5FileUtils.saveMD5File(imageFile, digest); + } catch (IOException ioe) { + badSds.add(sd); + } + } + storage.reportErrorsOnDirectories(badSds); + + // So long as this is the newest image available, + // advertise it as such to other checkpointers + // from now on + if (txid > storage.getMostRecentCheckpointTxId()) { + storage.setMostRecentCheckpointTxId(txid); + } } synchronized public void close() throws IOException { - getEditLog().close(); + if (editLog != null) { // 2NN doesn't have any edit log + getEditLog().close(); + } storage.close(); } @@ -1257,30 +1050,6 @@ public NNStorage getStorage() { return storage; } - @Override // NNStorageListener - public void errorOccurred(StorageDirectory sd) throws IOException { - // do nothing, - } - - @Override // NNStorageListener - public void formatOccurred(StorageDirectory sd) throws IOException { - if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) { - sd.lock(); - try { - saveCurrent(sd); - } finally { - sd.unlock(); - } - LOG.info("Storage directory " + sd.getRoot() - + " has been successfully formatted."); - } - }; - - @Override // NNStorageListener - public void directoryAvailable(StorageDirectory sd) throws IOException { - // do nothing - } - public int getLayoutVersion() { return storage.getLayoutVersion(); } @@ -1296,4 +1065,8 @@ public String getClusterID() { public String getBlockPoolID() { return storage.getBlockPoolID(); } + + public synchronized long getLastAppliedTxId() { + return lastAppliedTxId; + } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java index 46d08970b32..5bad6be91dd 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java @@ -44,6 +44,7 @@ import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature; import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo; import org.apache.hadoop.hdfs.server.common.GenerationStamp; +import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.io.Text; @@ -71,10 +72,8 @@ static class Loader { /** Set to true once a file has been loaded using this loader. */ private boolean loaded = false; - /** The image version of the loaded file */ - private int imgVersion; - /** The namespace ID of the loaded file */ - private int imgNamespaceID; + /** The transaction ID of the last edit represented by the loaded file */ + private long imgTxId; /** The MD5 sum of the loaded file */ private MD5Hash imgDigest; @@ -83,15 +82,6 @@ static class Loader { this.namesystem = namesystem; } - /** - * Return the version number of the image that has been loaded. - * @throws IllegalStateException if load() has not yet been called. - */ - int getLoadedImageVersion() { - checkLoaded(); - return imgVersion; - } - /** * Return the MD5 checksum of the image that has been loaded. * @throws IllegalStateException if load() has not yet been called. 
@@ -101,13 +91,9 @@ MD5Hash getLoadedImageMd5() { return imgDigest; } - /** - * Return the namespace ID of the image that has been loaded. - * @throws IllegalStateException if load() has not yet been called. - */ - int getLoadedNamespaceID() { + long getLoadedImageTxId() { checkLoaded(); - return imgNamespaceID; + return imgTxId; } /** @@ -156,10 +142,14 @@ void load(File curFile) * it should not contain version and namespace fields */ // read image version: first appeared in version -1 - imgVersion = in.readInt(); + int imgVersion = in.readInt(); + if(getLayoutVersion() != imgVersion) + throw new InconsistentFSStateException(curFile, + "imgVersion " + imgVersion + + " expected to be " + getLayoutVersion()); // read namespaceID: first appeared in version -2 - imgNamespaceID = in.readInt(); + in.readInt(); // read number of files long numFiles = readNumFiles(in); @@ -169,6 +159,15 @@ void load(File curFile) long genstamp = in.readLong(); namesystem.setGenerationStamp(genstamp); } + + // read the transaction ID of the last edit represented by + // this image + if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) { + imgTxId = in.readLong(); + } else { + imgTxId = 0; + } + // read compression related info FSImageCompression compression; @@ -234,7 +233,7 @@ private void updateRootAttr(INode root) { private void loadLocalNameINodes(long numFiles, DataInputStream in) throws IOException { assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION, - imgVersion); + getLayoutVersion()); assert numFiles > 0; // load root @@ -324,11 +323,12 @@ private void loadFullNameINodes(long numFiles, * @return an inode */ private INode loadINode(DataInputStream in) - throws IOException { + throws IOException { long modificationTime = 0; long atime = 0; long blockSize = 0; + int imgVersion = getLayoutVersion(); short replication = in.readShort(); replication = namesystem.adjustReplication(replication); modificationTime = in.readLong(); @@ -396,7 +396,10 @@ private INode loadINode(DataInputStream in) modificationTime, atime, nsQuota, dsQuota, blockSize); } - private void loadDatanodes(DataInputStream in) throws IOException { + private void loadDatanodes(DataInputStream in) + throws IOException { + int imgVersion = getLayoutVersion(); + if (imgVersion > -3) // pre datanode image version return; if (imgVersion <= -12) { @@ -412,6 +415,7 @@ private void loadDatanodes(DataInputStream in) throws IOException { private void loadFilesUnderConstruction(DataInputStream in) throws IOException { FSDirectory fsDir = namesystem.dir; + int imgVersion = getLayoutVersion(); if (imgVersion > -13) // pre lease image version return; int size = in.readInt(); @@ -437,7 +441,10 @@ private void loadFilesUnderConstruction(DataInputStream in) } } - private void loadSecretManagerState(DataInputStream in) throws IOException { + private void loadSecretManagerState(DataInputStream in) + throws IOException { + int imgVersion = getLayoutVersion(); + if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) { //SecretManagerState is not available. //This must not happen if security is turned on. 
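/*
 * Illustrative sketch (editor's addition, not part of the patch): the fixed header
 * that Saver.save() writes before the compression wrapper takes over, and that
 * Loader.load() reads back. The field order mirrors the code in this file; the
 * helper names are invented, and the variable-width readNumFiles() handling for
 * very old images is ignored.
 */
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

class ImageHeaderSketch {
  static void writeHeader(DataOutputStream out, int layoutVersion, int namespaceId,
      long numFiles, long genStamp, long lastTxId) throws IOException {
    out.writeInt(layoutVersion);   // must equal the running NameNode's layout version on load
    out.writeInt(namespaceId);     // still written, but the new loader reads and ignores it
    out.writeLong(numFiles);
    out.writeLong(genStamp);
    out.writeLong(lastTxId);       // new with STORED_TXIDS: txid of the last edit in the image
  }

  static long readLastTxId(DataInputStream in, boolean supportsStoredTxids) throws IOException {
    in.readInt();   // layout version
    in.readInt();   // namespace ID
    in.readLong();  // number of files
    in.readLong();  // generation stamp
    return supportsStoredTxids ? in.readLong() : 0L;  // pre-1073 images default to txid 0
  }
}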
@@ -446,8 +453,14 @@ private void loadSecretManagerState(DataInputStream in) throws IOException { namesystem.loadSecretManagerState(in); } + private int getLayoutVersion() { + return namesystem.getFSImage().getStorage().getLayoutVersion(); + } + + private long readNumFiles(DataInputStream in) + throws IOException { + int imgVersion = getLayoutVersion(); - private long readNumFiles(DataInputStream in) throws IOException { if (LayoutVersion.supports(Feature.NAMESPACE_QUOTA, imgVersion)) { return in.readLong(); } else { @@ -526,6 +539,7 @@ MD5Hash getSavedDigest() { } void save(File newFile, + long txid, FSNamesystem sourceNamesystem, FSImageCompression compression) throws IOException { @@ -542,9 +556,11 @@ void save(File newFile, DataOutputStream out = new DataOutputStream(fos); try { out.writeInt(FSConstants.LAYOUT_VERSION); - out.writeInt(sourceNamesystem.getFSImage().getStorage().getNamespaceID()); // TODO bad dependency + out.writeInt(sourceNamesystem.getFSImage() + .getStorage().getNamespaceID()); // TODO bad dependency out.writeLong(fsDir.rootDir.numItemsInTree()); out.writeLong(sourceNamesystem.getGenerationStamp()); + out.writeLong(txid); // write compression info and set up compressed stream out = compression.writeHeaderAndWrapStream(fos); diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java new file mode 100644 index 00000000000..cec2eeff2d8 --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java @@ -0,0 +1,272 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + + +import java.io.DataInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; +import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; +import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; + +/** + * Inspects a FSImage storage directory in the "old" (pre-HDFS-1073) format. 
+ * This format has the following data files: + * - fsimage + * - fsimage.ckpt (when checkpoint is being uploaded) + * - edits + * - edits.new (when logs are "rolled") + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +class FSImagePreTransactionalStorageInspector extends FSImageStorageInspector { + private static final Log LOG = + LogFactory.getLog(FSImagePreTransactionalStorageInspector.class); + + /* Flag if there is at least one storage dir that doesn't contain the newest + * fstime */ + private boolean hasOutOfDateStorageDirs = false; + /* Flag set false if there are any "previous" directories found */ + private boolean isUpgradeFinalized = true; + + // Track the name and edits dir with the latest times + private long latestNameCheckpointTime = Long.MIN_VALUE; + private long latestEditsCheckpointTime = Long.MIN_VALUE; + private StorageDirectory latestNameSD = null; + private StorageDirectory latestEditsSD = null; + + /** Set to determine if all of storageDirectories share the same checkpoint */ + Set checkpointTimes = new HashSet(); + + private List imageDirs = new ArrayList(); + private List editsDirs = new ArrayList(); + + @Override + void inspectDirectory(StorageDirectory sd) throws IOException { + // Was the file just formatted? + if (!sd.getVersionFile().exists()) { + hasOutOfDateStorageDirs = true; + return; + } + + boolean imageExists = false; + boolean editsExists = false; + + // Determine if sd is image, edits or both + if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) { + imageExists = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE).exists(); + imageDirs.add(sd.getRoot().getCanonicalPath()); + } + + if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) { + editsExists = NNStorage.getStorageFile(sd, NameNodeFile.EDITS).exists(); + editsDirs.add(sd.getRoot().getCanonicalPath()); + } + + long checkpointTime = readCheckpointTime(sd); + + checkpointTimes.add(checkpointTime); + + if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE) && + (latestNameCheckpointTime < checkpointTime) && imageExists) { + latestNameCheckpointTime = checkpointTime; + latestNameSD = sd; + } + + if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS) && + (latestEditsCheckpointTime < checkpointTime) && editsExists) { + latestEditsCheckpointTime = checkpointTime; + latestEditsSD = sd; + } + + // check that we have a valid, non-default checkpointTime + if (checkpointTime <= 0L) + hasOutOfDateStorageDirs = true; + + // set finalized flag + isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists(); + } + + /** + * Determine the checkpoint time of the specified StorageDirectory + * + * @param sd StorageDirectory to check + * @return If file exists and can be read, last checkpoint time. If not, 0L. 
+ * @throws IOException On errors processing file pointed to by sd + */ + static long readCheckpointTime(StorageDirectory sd) throws IOException { + File timeFile = NNStorage.getStorageFile(sd, NameNodeFile.TIME); + long timeStamp = 0L; + if (timeFile.exists() && timeFile.canRead()) { + DataInputStream in = new DataInputStream(new FileInputStream(timeFile)); + try { + timeStamp = in.readLong(); + } finally { + in.close(); + } + } + return timeStamp; + } + + @Override + boolean isUpgradeFinalized() { + return isUpgradeFinalized; + } + + @Override + LoadPlan createLoadPlan() throws IOException { + // We should have at least one image and one edits dirs + if (latestNameSD == null) + throw new IOException("Image file is not found in " + imageDirs); + if (latestEditsSD == null) + throw new IOException("Edits file is not found in " + editsDirs); + + // Make sure we are loading image and edits from same checkpoint + if (latestNameCheckpointTime > latestEditsCheckpointTime + && latestNameSD != latestEditsSD + && latestNameSD.getStorageDirType() == NameNodeDirType.IMAGE + && latestEditsSD.getStorageDirType() == NameNodeDirType.EDITS) { + // This is a rare failure when NN has image-only and edits-only + // storage directories, and fails right after saving images, + // in some of the storage directories, but before purging edits. + // See -NOTE- in saveNamespace(). + LOG.error("This is a rare failure scenario!!!"); + LOG.error("Image checkpoint time " + latestNameCheckpointTime + + " > edits checkpoint time " + latestEditsCheckpointTime); + LOG.error("Name-node will treat the image as the latest state of " + + "the namespace. Old edits will be discarded."); + } else if (latestNameCheckpointTime != latestEditsCheckpointTime) { + throw new IOException("Inconsistent storage detected, " + + "image and edits checkpoint times do not match. " + + "image checkpoint time = " + latestNameCheckpointTime + + "edits checkpoint time = " + latestEditsCheckpointTime); + } + + return new PreTransactionalLoadPlan(); + } + + @Override + boolean needToSave() { + return hasOutOfDateStorageDirs || + checkpointTimes.size() != 1 || + latestNameCheckpointTime > latestEditsCheckpointTime; + + } + + private class PreTransactionalLoadPlan extends LoadPlan { + + @Override + boolean doRecovery() throws IOException { + LOG.debug( + "Performing recovery in "+ latestNameSD + " and " + latestEditsSD); + + boolean needToSave = false; + File curFile = + NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE); + File ckptFile = + NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE_NEW); + + // + // If we were in the midst of a checkpoint + // + if (ckptFile.exists()) { + needToSave = true; + if (NNStorage.getStorageFile(latestEditsSD, NameNodeFile.EDITS_NEW) + .exists()) { + // + // checkpointing migth have uploaded a new + // merged image, but we discard it here because we are + // not sure whether the entire merged image was uploaded + // before the namenode crashed. + // + if (!ckptFile.delete()) { + throw new IOException("Unable to delete " + ckptFile); + } + } else { + // + // checkpointing was in progress when the namenode + // shutdown. The fsimage.ckpt was created and the edits.new + // file was moved to edits. We complete that checkpoint by + // moving fsimage.new to fsimage. There is no need to + // update the fstime file here. renameTo fails on Windows + // if the destination file already exists. 
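/*
 * Illustrative sketch (editor's addition, not part of the patch): the crash-recovery
 * decision doRecovery() makes for the old layout, reduced to a pure function. The
 * enum and method names are invented for the example.
 */
class OldLayoutRecoverySketch {
  enum Action { NOTHING, DISCARD_CKPT, PROMOTE_CKPT }

  // fsimage.ckpt + edits.new present -> the uploaded image may be incomplete: discard it
  // fsimage.ckpt only                -> edits were already rolled: finish the rename
  // no fsimage.ckpt                  -> nothing was in flight
  static Action plan(boolean ckptExists, boolean editsNewExists) {
    if (!ckptExists) {
      return Action.NOTHING;
    }
    return editsNewExists ? Action.DISCARD_CKPT : Action.PROMOTE_CKPT;
  }
}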
+ // + if (!ckptFile.renameTo(curFile)) { + if (!curFile.delete()) + LOG.warn("Unable to delete dir " + curFile + " before rename"); + if (!ckptFile.renameTo(curFile)) { + throw new IOException("Unable to rename " + ckptFile + + " to " + curFile); + } + } + } + } + return needToSave; + } + + @Override + File getImageFile() { + return NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE); + } + + @Override + List getEditsFiles() { + if (latestNameCheckpointTime > latestEditsCheckpointTime) { + // the image is already current, discard edits + LOG.debug( + "Name checkpoint time is newer than edits, not loading edits."); + return Collections.emptyList(); + } + + return getEditsInStorageDir(latestEditsSD); + } + + @Override + StorageDirectory getStorageDirectoryForProperties() { + return latestNameSD; + } + } + + /** + * @return a list with the paths to EDITS and EDITS_NEW (if it exists) + * in a given storage directory. + */ + static List getEditsInStorageDir(StorageDirectory sd) { + ArrayList files = new ArrayList(); + File edits = NNStorage.getStorageFile(sd, NameNodeFile.EDITS); + assert edits.exists() : "Expected edits file at " + edits; + files.add(edits); + File editsNew = NNStorage.getStorageFile(sd, NameNodeFile.EDITS_NEW); + if (editsNew.exists()) { + files.add(editsNew); + } + return files; + } +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java new file mode 100644 index 00000000000..6249f2f5d5d --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.classification.InterfaceStability; +import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; + +/** + * Interface responsible for inspecting a set of storage directories and devising + * a plan to load the namespace from them. + */ +@InterfaceAudience.Private +@InterfaceStability.Unstable +abstract class FSImageStorageInspector { + /** + * Inspect the contents of the given storage directory. + */ + abstract void inspectDirectory(StorageDirectory sd) throws IOException; + + /** + * @return false if any of the storage directories have an unfinalized upgrade + */ + abstract boolean isUpgradeFinalized(); + + /** + * Create a plan to load the image from the set of inspected storage directories. 
+   * @throws IOException if not enough files are available (e.g. no image found in any directory)
+   */
+  abstract LoadPlan createLoadPlan() throws IOException;
+
+  /**
+   * @return true if the directories are in such a state that the image should be re-saved
+   * following the load
+   */
+  abstract boolean needToSave();
+
+  /**
+   * A plan to load the namespace from disk, providing the locations from which to load
+   * the image and a set of edits files.
+   */
+  abstract static class LoadPlan {
+    /**
+     * Execute atomic move sequence in the chosen storage directories,
+     * in order to recover from an interrupted checkpoint.
+     * @return true if some recovery action was taken
+     */
+    abstract boolean doRecovery() throws IOException;
+
+    /**
+     * @return the file from which to load the image data
+     */
+    abstract File getImageFile();
+
+    /**
+     * @return a list of files containing edits to replay
+     */
+    abstract List<File> getEditsFiles();
+
+    /**
+     * @return the storage directory containing the VERSION file that should be
+     * loaded.
+     */
+    abstract StorageDirectory getStorageDirectoryForProperties();
+
+    @Override
+    public String toString() {
+      StringBuilder sb = new StringBuilder();
+      sb.append("Will load image file: ").append(getImageFile()).append("\n");
+      sb.append("Will load edits files:").append("\n");
+      for (File f : getEditsFiles()) {
+        sb.append("  ").append(f).append("\n");
+      }
+      sb.append("Will load metadata from: ")
+        .append(getStorageDirectoryForProperties())
+        .append("\n");
+      return sb.toString();
+    }
+  }
+}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
new file mode 100644
index 00000000000..8a2c2aa4db9
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
@@ -0,0 +1,688 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.hdfs.protocol.FSConstants; +import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; +import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.EditLogValidation; +import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; +import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; + +class FSImageTransactionalStorageInspector extends FSImageStorageInspector { + public static final Log LOG = LogFactory.getLog( + FSImageTransactionalStorageInspector.class); + + private boolean needToSave = false; + private boolean isUpgradeFinalized = true; + + List foundImages = new ArrayList(); + List foundEditLogs = new ArrayList(); + SortedMap logGroups = new TreeMap(); + long maxSeenTxId = 0; + + private static final Pattern IMAGE_REGEX = Pattern.compile( + NameNodeFile.IMAGE.getName() + "_(\\d+)"); + private static final Pattern EDITS_REGEX = Pattern.compile( + NameNodeFile.EDITS.getName() + "_(\\d+)-(\\d+)"); + private static final Pattern EDITS_INPROGRESS_REGEX = Pattern.compile( + NameNodeFile.EDITS_INPROGRESS.getName() + "_(\\d+)"); + + @Override + public void inspectDirectory(StorageDirectory sd) throws IOException { + // Was the directory just formatted? + if (!sd.getVersionFile().exists()) { + LOG.info("No version file in " + sd.getRoot()); + needToSave |= true; + return; + } + + File currentDir = sd.getCurrentDir(); + File filesInStorage[]; + try { + filesInStorage = FileUtil.listFiles(currentDir); + } catch (IOException ioe) { + LOG.warn("Unable to inspect storage directory " + currentDir, + ioe); + return; + } + + for (File f : filesInStorage) { + LOG.debug("Checking file " + f); + String name = f.getName(); + + // Check for fsimage_* + Matcher imageMatch = IMAGE_REGEX.matcher(name); + if (imageMatch.matches()) { + if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) { + try { + long txid = Long.valueOf(imageMatch.group(1)); + foundImages.add(new FoundFSImage(sd, f, txid)); + } catch (NumberFormatException nfe) { + LOG.error("Image file " + f + " has improperly formatted " + + "transaction ID"); + // skip + } + } else { + LOG.warn("Found image file at " + f + " but storage directory is " + + "not configured to contain images."); + } + } + } + + + // Check for a seen_txid file, which marks a minimum transaction ID that + // must be included in our load plan. 
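/*
 * Illustrative sketch (editor's addition, not part of the patch): how the three
 * patterns above classify a post-HDFS-1073 "current/" listing. The literal prefixes
 * assume NameNodeFile.IMAGE, EDITS and EDITS_INPROGRESS resolve to "fsimage",
 * "edits" and "edits_inprogress"; any zero-padding of txids is ignored.
 */
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class StorageFileNameSketch {
  static final Pattern IMAGE = Pattern.compile("fsimage_(\\d+)");
  static final Pattern EDITS = Pattern.compile("edits_(\\d+)-(\\d+)");
  static final Pattern EDITS_INPROGRESS = Pattern.compile("edits_inprogress_(\\d+)");

  public static void main(String[] args) {
    for (String name : new String[] {
        "fsimage_100", "edits_1-100", "edits_inprogress_101", "seen_txid"}) {
      Matcher m;
      if ((m = IMAGE.matcher(name)).matches()) {
        System.out.println(name + " -> checkpoint as of txid " + m.group(1));
      } else if ((m = EDITS.matcher(name)).matches()) {
        System.out.println(name + " -> finalized segment, txids "
            + m.group(1) + " through " + m.group(2));
      } else if ((m = EDITS_INPROGRESS.matcher(name)).matches()) {
        System.out.println(name + " -> in-progress segment starting at txid " + m.group(1));
      } else {
        System.out.println(name + " -> not an image or edits file");
      }
    }
  }
}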
+ try { + maxSeenTxId = Math.max(maxSeenTxId, NNStorage.readTransactionIdFile(sd)); + } catch (IOException ioe) { + LOG.warn("Unable to determine the max transaction ID seen by " + sd, ioe); + } + + List editLogs = matchEditLogs(filesInStorage); + if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) { + for (FoundEditLog log : editLogs) { + addEditLog(log); + } + } else if (!editLogs.isEmpty()){ + LOG.warn("Found the following edit log file(s) in " + sd + + " even though it was not configured to store edits:\n" + + " " + Joiner.on("\n ").join(editLogs)); + + } + + // set finalized flag + isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists(); + } + + static List matchEditLogs(File[] filesInStorage) { + List ret = Lists.newArrayList(); + for (File f : filesInStorage) { + String name = f.getName(); + // Check for edits + Matcher editsMatch = EDITS_REGEX.matcher(name); + if (editsMatch.matches()) { + try { + long startTxId = Long.valueOf(editsMatch.group(1)); + long endTxId = Long.valueOf(editsMatch.group(2)); + ret.add(new FoundEditLog(f, startTxId, endTxId)); + } catch (NumberFormatException nfe) { + LOG.error("Edits file " + f + " has improperly formatted " + + "transaction ID"); + // skip + } + } + + // Check for in-progress edits + Matcher inProgressEditsMatch = EDITS_INPROGRESS_REGEX.matcher(name); + if (inProgressEditsMatch.matches()) { + try { + long startTxId = Long.valueOf(inProgressEditsMatch.group(1)); + ret.add( + new FoundEditLog(f, startTxId, FoundEditLog.UNKNOWN_END)); + } catch (NumberFormatException nfe) { + LOG.error("In-progress edits file " + f + " has improperly " + + "formatted transaction ID"); + // skip + } + } + } + return ret; + } + + private void addEditLog(FoundEditLog foundEditLog) { + foundEditLogs.add(foundEditLog); + LogGroup group = logGroups.get(foundEditLog.startTxId); + if (group == null) { + group = new LogGroup(foundEditLog.startTxId); + logGroups.put(foundEditLog.startTxId, group); + } + group.add(foundEditLog); + } + + + @Override + public boolean isUpgradeFinalized() { + return isUpgradeFinalized; + } + + /** + * @return the image that has the most recent associated transaction ID. + * If there are multiple storage directories which contain equal images + * the storage directory that was inspected first will be preferred. + * + * Returns null if no images were found. + */ + FoundFSImage getLatestImage() { + FoundFSImage ret = null; + for (FoundFSImage img : foundImages) { + if (ret == null || img.txId > ret.txId) { + ret = img; + } + } + return ret; + } + + public List getFoundImages() { + return ImmutableList.copyOf(foundImages); + } + + public List getFoundEditLogs() { + return ImmutableList.copyOf(foundEditLogs); + } + + @Override + public LoadPlan createLoadPlan() throws IOException { + if (foundImages.isEmpty()) { + throw new FileNotFoundException("No valid image files found"); + } + + FoundFSImage recoveryImage = getLatestImage(); + LogLoadPlan logPlan = createLogLoadPlan(recoveryImage.txId, Long.MAX_VALUE); + + return new TransactionalLoadPlan(recoveryImage, + logPlan); + } + + /** + * Plan which logs to load in order to bring the namespace up-to-date. 
+ * Transactions will be considered in the range (sinceTxId, maxTxId] + * + * @param sinceTxId the highest txid that is already loaded + * (eg from the image checkpoint) + * @param maxStartTxId ignore any log files that start after this txid + */ + LogLoadPlan createLogLoadPlan(long sinceTxId, long maxStartTxId) throws IOException { + long expectedTxId = sinceTxId + 1; + + List recoveryLogs = new ArrayList(); + + SortedMap tailGroups = logGroups.tailMap(expectedTxId); + if (logGroups.size() > tailGroups.size()) { + LOG.debug("Excluded " + (logGroups.size() - tailGroups.size()) + + " groups of logs because they start with a txid less than image " + + "txid " + sinceTxId); + } + + SortedMap usefulGroups; + if (maxStartTxId > sinceTxId) { + usefulGroups = tailGroups.headMap(maxStartTxId); + } else { + usefulGroups = new TreeMap(); + } + + if (usefulGroups.size() > tailGroups.size()) { + LOG.debug("Excluded " + (tailGroups.size() - usefulGroups.size()) + + " groups of logs because they start with a txid higher than max " + + "txid " + sinceTxId); + } + + + for (Map.Entry entry : usefulGroups.entrySet()) { + long logStartTxId = entry.getKey(); + LogGroup logGroup = entry.getValue(); + + logGroup.planRecovery(); + + if (expectedTxId != FSConstants.INVALID_TXID && logStartTxId != expectedTxId) { + throw new IOException("Expected next log group would start at txid " + + expectedTxId + " but starts at txid " + logStartTxId); + } + + // We can pick any of the non-corrupt logs here + recoveryLogs.add(logGroup.getBestNonCorruptLog()); + + // If this log group was finalized, we know to expect the next + // log group to start at the following txid (ie no gaps) + if (logGroup.hasKnownLastTxId()) { + expectedTxId = logGroup.getLastTxId() + 1; + } else { + // the log group was in-progress so we don't know what ID + // the next group should start from. + expectedTxId = FSConstants.INVALID_TXID; + } + } + + long lastLogGroupStartTxId = usefulGroups.isEmpty() ? + 0 : usefulGroups.lastKey(); + if (maxSeenTxId > sinceTxId && + maxSeenTxId > lastLogGroupStartTxId) { + String msg = "At least one storage directory indicated it has seen a " + + "log segment starting at txid " + maxSeenTxId; + if (usefulGroups.isEmpty()) { + msg += " but there are no logs to load."; + } else { + msg += " but the most recent log file found starts with txid " + + lastLogGroupStartTxId; + } + throw new IOException(msg); + } + + return new LogLoadPlan(recoveryLogs, + Lists.newArrayList(usefulGroups.values())); + + } + + @Override + public boolean needToSave() { + return needToSave; + } + + + RemoteEditLogManifest getEditLogManifest(long sinceTxId) { + List logs = Lists.newArrayList(); + for (LogGroup g : logGroups.values()) { + if (!g.hasFinalized) continue; + + FoundEditLog fel = g.getBestNonCorruptLog(); + if (fel.getLastTxId() < sinceTxId) continue; + + logs.add(new RemoteEditLog(fel.getStartTxId(), + fel.getLastTxId())); + } + + return new RemoteEditLogManifest(logs); + } + + /** + * A group of logs that all start at the same txid. + * + * Handles determining which logs are corrupt and which should be considered + * candidates for loading. 
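/*
 * Illustrative sketch (editor's addition, not part of the patch): the selection
 * createLogLoadPlan() performs, replayed on a toy map keyed by segment start txid.
 * With the chosen image at txid 100, the 1-100 segment is skipped and the segments
 * at 101 and 201 are replayed in order, with a gap check in between. Here -1 stands
 * in for an unknown (in-progress) end txid, like INVALID_TXID in the real code.
 */
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;

class LogLoadPlanSketch {
  public static void main(String[] args) throws IOException {
    TreeMap<Long, Long> segments = new TreeMap<Long, Long>();  // start txid -> end txid
    segments.put(1L, 100L);
    segments.put(101L, 200L);
    segments.put(201L, -1L);     // in-progress segment, end unknown

    long sinceTxId = 100;        // txid covered by the image being loaded
    long expected = sinceTxId + 1;
    for (Map.Entry<Long, Long> e : segments.tailMap(expected).entrySet()) {
      if (expected != -1 && e.getKey() != expected) {
        throw new IOException("Gap in edit logs: expected a segment starting at " + expected);
      }
      System.out.println("replay segment starting at txid " + e.getKey());
      expected = (e.getValue() == -1) ? -1 : e.getValue() + 1;
    }
  }
}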
+ */ + static class LogGroup { + long startTxId; + List logs = new ArrayList();; + private Set endTxIds = new TreeSet(); + private boolean hasInProgress = false; + private boolean hasFinalized = false; + + LogGroup(long startTxId) { + this.startTxId = startTxId; + } + + FoundEditLog getBestNonCorruptLog() { + // First look for non-corrupt finalized logs + for (FoundEditLog log : logs) { + if (!log.isCorrupt() && !log.isInProgress()) { + return log; + } + } + // Then look for non-corrupt in-progress logs + for (FoundEditLog log : logs) { + if (!log.isCorrupt()) { + return log; + } + } + + // We should never get here, because we don't get to the planning stage + // without calling planRecovery first, and if we've called planRecovery, + // we would have already thrown if there were no non-corrupt logs! + throw new IllegalStateException( + "No non-corrupt logs for txid " + startTxId); + } + + /** + * @return true if we can determine the last txid in this log group. + */ + boolean hasKnownLastTxId() { + for (FoundEditLog log : logs) { + if (!log.isInProgress()) { + return true; + } + } + return false; + } + + /** + * @return the last txid included in the logs in this group + * @throws IllegalStateException if it is unknown - + * {@see #hasKnownLastTxId()} + */ + long getLastTxId() { + for (FoundEditLog log : logs) { + if (!log.isInProgress()) { + return log.lastTxId; + } + } + throw new IllegalStateException("LogGroup only has in-progress logs"); + } + + + void add(FoundEditLog log) { + assert log.getStartTxId() == startTxId; + logs.add(log); + + if (log.isInProgress()) { + hasInProgress = true; + } else { + hasFinalized = true; + endTxIds.add(log.lastTxId); + } + } + + void planRecovery() throws IOException { + assert hasInProgress || hasFinalized; + + checkConsistentEndTxIds(); + + if (hasFinalized && hasInProgress) { + planMixedLogRecovery(); + } else if (!hasFinalized && hasInProgress) { + planAllInProgressRecovery(); + } else if (hasFinalized && !hasInProgress) { + LOG.debug("No recovery necessary for logs starting at txid " + + startTxId); + } + } + + /** + * Recovery case for when some logs in the group were in-progress, and + * others were finalized. This happens when one of the storage + * directories fails. + * + * The in-progress logs in this case should be considered corrupt. + */ + private void planMixedLogRecovery() throws IOException { + for (FoundEditLog log : logs) { + if (log.isInProgress()) { + LOG.warn("Log at " + log.getFile() + " is in progress, but " + + "other logs starting at the same txid " + startTxId + + " are finalized. Moving aside."); + log.markCorrupt(); + } + } + } + + /** + * Recovery case for when all of the logs in the group were in progress. + * This happens if the NN completely crashes and restarts. In this case + * we check the non-zero lengths of each log file, and any logs that are + * less than the max of these lengths are considered corrupt. + */ + private void planAllInProgressRecovery() throws IOException { + // We only have in-progress logs. We need to figure out which logs have + // the latest data to reccover them + LOG.warn("Logs beginning at txid " + startTxId + " were are all " + + "in-progress (probably truncated due to a previous NameNode " + + "crash)"); + if (logs.size() == 1) { + // Only one log, it's our only choice! + FoundEditLog log = logs.get(0); + if (log.validateLog().numTransactions == 0) { + // If it has no transactions, we should consider it corrupt just + // to be conservative. 
+ // See comment below for similar case + LOG.warn("Marking log at " + log.getFile() + " as corrupt since " + + "it has no transactions in it."); + log.markCorrupt(); + } + return; + } + + long maxValidTxnCount = Long.MIN_VALUE; + for (FoundEditLog log : logs) { + long validTxnCount = log.validateLog().numTransactions; + LOG.warn(" Log " + log.getFile() + + " valid txns=" + validTxnCount + + " valid len=" + log.validateLog().validLength); + maxValidTxnCount = Math.max(maxValidTxnCount, validTxnCount); + } + + for (FoundEditLog log : logs) { + long txns = log.validateLog().numTransactions; + if (txns < maxValidTxnCount) { + LOG.warn("Marking log at " + log.getFile() + " as corrupt since " + + "it is has only " + txns + " valid txns whereas another " + + "log has " + maxValidTxnCount); + log.markCorrupt(); + } else if (txns == 0) { + // this can happen if the NN crashes right after rolling a log + // but before the START_LOG_SEGMENT txn is written. Since the log + // is empty, we can just move it aside to its corrupt name. + LOG.warn("Marking log at " + log.getFile() + " as corrupt since " + + "it has no transactions in it."); + log.markCorrupt(); + } + } + } + + /** + * Check for the case when we have multiple finalized logs and they have + * different ending transaction IDs. This violates an invariant that all + * log directories should roll together. We should abort in this case. + */ + private void checkConsistentEndTxIds() throws IOException { + if (hasFinalized && endTxIds.size() > 1) { + throw new IOException("More than one ending txid was found " + + "for logs starting at txid " + startTxId + ". " + + "Found: " + StringUtils.join(endTxIds, ',')); + } + } + + void recover() throws IOException { + for (FoundEditLog log : logs) { + if (log.isCorrupt()) { + log.moveAsideCorruptFile(); + } else if (log.isInProgress()) { + log.finalizeLog(); + } + } + } + } + + /** + * Record of an image that has been located and had its filename parsed. + */ + static class FoundFSImage { + final StorageDirectory sd; + final long txId; + private final File file; + + FoundFSImage(StorageDirectory sd, File file, long txId) { + assert txId >= 0 : "Invalid txid on " + file +": " + txId; + + this.sd = sd; + this.txId = txId; + this.file = file; + } + + File getFile() { + return file; + } + + public long getTxId() { + return txId; + } + + @Override + public String toString() { + return file.toString(); + } + } + + /** + * Record of an edit log that has been located and had its filename parsed. 
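/*
 * Illustrative sketch (editor's addition, not part of the patch): the rule
 * planAllInProgressRecovery() applies when every copy of a segment is still
 * in-progress. Any copy with fewer valid transactions than the longest copy, or
 * with none at all, is marked corrupt and later moved aside; the method name below
 * is invented for the example.
 */
class InProgressRecoverySketch {
  static boolean[] corruptFlags(long[] validTxnCounts) {
    long max = Long.MIN_VALUE;
    for (long count : validTxnCounts) {
      max = Math.max(max, count);
    }
    boolean[] corrupt = new boolean[validTxnCounts.length];
    for (int i = 0; i < validTxnCounts.length; i++) {
      corrupt[i] = validTxnCounts[i] < max || validTxnCounts[i] == 0;
    }
    return corrupt;
  }
  // e.g. counts {12, 12, 7} -> {false, false, true}: the short copy is moved aside and
  // either surviving copy can be finalized as edits_<start>-<start + 11>.
}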
+ */ + static class FoundEditLog { + File file; + final long startTxId; + long lastTxId; + + private EditLogValidation cachedValidation = null; + private boolean isCorrupt = false; + + static final long UNKNOWN_END = -1; + + FoundEditLog(File file, + long startTxId, long endTxId) { + assert endTxId == UNKNOWN_END || endTxId >= startTxId; + assert startTxId > 0; + assert file != null; + + this.startTxId = startTxId; + this.lastTxId = endTxId; + this.file = file; + } + + public void finalizeLog() throws IOException { + long numTransactions = validateLog().numTransactions; + long lastTxId = startTxId + numTransactions - 1; + File dst = new File(file.getParentFile(), + NNStorage.getFinalizedEditsFileName(startTxId, lastTxId)); + LOG.info("Finalizing edits log " + file + " by renaming to " + + dst.getName()); + if (!file.renameTo(dst)) { + throw new IOException("Couldn't finalize log " + + file + " to " + dst); + } + this.lastTxId = lastTxId; + file = dst; + } + + long getStartTxId() { + return startTxId; + } + + long getLastTxId() { + return lastTxId; + } + + EditLogValidation validateLog() throws IOException { + if (cachedValidation == null) { + cachedValidation = FSEditLogLoader.validateEditLog(file); + } + return cachedValidation; + } + + boolean isInProgress() { + return (lastTxId == UNKNOWN_END); + } + + File getFile() { + return file; + } + + void markCorrupt() { + isCorrupt = true; + } + + boolean isCorrupt() { + return isCorrupt; + } + + void moveAsideCorruptFile() throws IOException { + assert isCorrupt; + + File src = file; + File dst = new File(src.getParent(), src.getName() + ".corrupt"); + boolean success = src.renameTo(dst); + if (!success) { + throw new IOException( + "Couldn't rename corrupt log " + src + " to " + dst); + } + file = dst; + } + + @Override + public String toString() { + return file.toString(); + } + } + + static class TransactionalLoadPlan extends LoadPlan { + final FoundFSImage image; + final LogLoadPlan logPlan; + + public TransactionalLoadPlan(FoundFSImage image, + LogLoadPlan logPlan) { + super(); + this.image = image; + this.logPlan = logPlan; + } + + @Override + boolean doRecovery() throws IOException { + logPlan.doRecovery(); + return false; + } + + @Override + File getImageFile() { + return image.getFile(); + } + + @Override + List getEditsFiles() { + return logPlan.getEditsFiles(); + } + + @Override + StorageDirectory getStorageDirectoryForProperties() { + return image.sd; + } + } + + static class LogLoadPlan { + final List editLogs; + final List logGroupsToRecover; + + LogLoadPlan(List editLogs, + List logGroupsToRecover) { + this.editLogs = editLogs; + this.logGroupsToRecover = logGroupsToRecover; + } + + public void doRecovery() throws IOException { + for (LogGroup g : logGroupsToRecover) { + g.recover(); + } + } + + public List getEditsFiles() { + List ret = new ArrayList(); + for (FoundEditLog log : editLogs) { + ret.add(log.getFile()); + } + return ret; + } + } +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 3b585732881..b3332ce468c 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -105,6 +105,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.UnderReplicatedBlocks; import org.apache.hadoop.hdfs.server.common.GenerationStamp; import org.apache.hadoop.hdfs.server.common.HdfsConstants.BlockUCState; +import 
org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole; import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport; @@ -124,6 +125,7 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand; import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand; import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.Text; @@ -323,8 +325,7 @@ private void initialize(Configuration conf, FSImage fsImage) if(fsImage == null) { this.dir = new FSDirectory(this, conf); StartupOption startOpt = NameNode.getStartupOption(conf); - this.dir.loadFSImage(getNamespaceDirs(conf), - getNamespaceEditsDirs(conf), startOpt); + this.dir.loadFSImage(startOpt); long timeTakenToLoadFSImage = now() - systemStart; LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs"); NameNode.getNameNodeMetrics().setFsImageLoadTime( @@ -392,8 +393,9 @@ public static Collection getStorageDirs(Configuration conf, + propertyName + "\" in hdfs-site.xml;" + "\n\t\t- use Backup Node as a persistent and up-to-date storage " + "of the file system meta-data."); - } else if (dirNames.isEmpty()) - dirNames.add("file:///tmp/hadoop/dfs/name"); + } else if (dirNames.isEmpty()) { + dirNames = Collections.singletonList("file:///tmp/hadoop/dfs/name"); + } return Util.stringCollectionAsURIs(dirNames); } @@ -3258,7 +3260,7 @@ void saveNamespace() throws AccessControlException, IOException { throw new IOException("Safe mode should be turned ON " + "in order to create namespace image."); } - getFSImage().saveNamespace(true); + getFSImage().saveNamespace(); LOG.info("New namespace image has been created."); } finally { readUnlock(); @@ -4003,8 +4005,8 @@ String getSafeModeTip() { } } - long getEditLogSize() throws IOException { - return getEditLog().getEditLogSize(); + public long getTransactionID() { + return getEditLog().getSyncTxId(); } CheckpointSignature rollEditLog() throws IOException { @@ -4019,24 +4021,9 @@ CheckpointSignature rollEditLog() throws IOException { writeUnlock(); } } - - /** - * Moves fsimage.ckpt to fsImage and edits.new to edits - * Reopens the new edits file. - * - * @param sig the signature of this checkpoint (old image) - */ - void rollFSImage(CheckpointSignature sig) throws IOException { - writeLock(); - try { - if (isInSafeMode()) { - throw new SafeModeException("Image not rolled", safeMode); - } - LOG.info("Roll FSImage from " + Server.getRemoteAddress()); - getFSImage().rollFSImage(sig, true); - } finally { - writeUnlock(); - } + + public RemoteEditLogManifest getEditLogManifest(long sinceTxId) throws IOException { + return getEditLog().getEditLogManifest(sinceTxId); } NamenodeCommand startCheckpoint( @@ -4516,31 +4503,29 @@ void saveFilesUnderConstruction(DataOutputStream out) throws IOException { } /** - * Register a name-node. - *

    - * Registration is allowed if there is no ongoing streaming to - * another backup node. - * We currently allow only one backup node, but multiple chackpointers - * if there are no backups. + * Register a Backup name-node, verifying that it belongs + * to the correct namespace, and adding it to the set of + * active journals if necessary. * - * @param registration - * @throws IOException + * @param bnReg registration of the new BackupNode + * @param nnReg registration of this NameNode + * @throws IOException if the namespace IDs do not match */ - void registerBackupNode(NamenodeRegistration registration) - throws IOException { + void registerBackupNode(NamenodeRegistration bnReg, + NamenodeRegistration nnReg) throws IOException { writeLock(); try { if(getFSImage().getStorage().getNamespaceID() - != registration.getNamespaceID()) + != bnReg.getNamespaceID()) throw new IOException("Incompatible namespaceIDs: " + " Namenode namespaceID = " + getFSImage().getStorage().getNamespaceID() + "; " - + registration.getRole() + - " node namespaceID = " + registration.getNamespaceID()); - boolean regAllowed = getEditLog().checkBackupRegistration(registration); - if(!regAllowed) - throw new IOException("Registration is not allowed. " + - "Another node is registered as a backup."); + + bnReg.getRole() + + " node namespaceID = " + bnReg.getNamespaceID()); + if (bnReg.getRole() == NamenodeRole.BACKUP) { + getFSImage().getEditLog().registerBackupNode( + bnReg, nnReg); + } } finally { writeUnlock(); } @@ -5080,4 +5065,6 @@ public BlockManager getBlockManager() { void removeDecomNodeFromList(List nodeList) { getBlockManager().getDatanodeManager().removeDecomNodeFromList(nodeList); } + + } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java new file mode 100644 index 00000000000..360a118fa27 --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java @@ -0,0 +1,114 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import java.io.File; +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.fs.FileUtil; +import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; +import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.FoundEditLog; +import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; + +/** + * Journal manager for the common case of edits files being written + * to a storage directory. + * + * Note: this class is not thread-safe and should be externally + * synchronized. + */ +class FileJournalManager implements JournalManager { + private static final Log LOG = LogFactory.getLog(FileJournalManager.class); + + private final StorageDirectory sd; + private int outputBufferCapacity = 512*1024; + + public FileJournalManager(StorageDirectory sd) { + this.sd = sd; + } + + @Override + public EditLogOutputStream startLogSegment(long txid) throws IOException { + File newInProgress = NNStorage.getInProgressEditsFile(sd, txid); + EditLogOutputStream stm = new EditLogFileOutputStream(newInProgress, + outputBufferCapacity); + stm.create(); + return stm; + } + + @Override + public void finalizeLogSegment(long firstTxId, long lastTxId) + throws IOException { + File inprogressFile = NNStorage.getInProgressEditsFile( + sd, firstTxId); + File dstFile = NNStorage.getFinalizedEditsFile( + sd, firstTxId, lastTxId); + LOG.debug("Finalizing edits file " + inprogressFile + " -> " + dstFile); + + Preconditions.checkState(!dstFile.exists(), + "Can't finalize edits file " + inprogressFile + " since finalized file " + + "already exists"); + if (!inprogressFile.renameTo(dstFile)) { + throw new IOException("Unable to finalize edits file " + inprogressFile); + } + } + + @VisibleForTesting + public StorageDirectory getStorageDirectory() { + return sd; + } + + @Override + public String toString() { + return "FileJournalManager for storage directory " + sd; + } + + @Override + public void setOutputBufferCapacity(int size) { + this.outputBufferCapacity = size; + } + + @Override + public void purgeLogsOlderThan(long minTxIdToKeep, StoragePurger purger) + throws IOException { + File[] files = FileUtil.listFiles(sd.getCurrentDir()); + List editLogs = + FSImageTransactionalStorageInspector.matchEditLogs(files); + for (FoundEditLog log : editLogs) { + if (log.getStartTxId() < minTxIdToKeep && + log.getLastTxId() < minTxIdToKeep) { + purger.purgeLog(log); + } + } + } + + @Override + public EditLogInputStream getInProgressInputStream(long segmentStartsAtTxId) + throws IOException { + File f = NNStorage.getInProgressEditsFile(sd, segmentStartsAtTxId); + return new EditLogFileInputStream(f); + } + +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java index 04bfe845a43..8753b270f1d 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java @@ -20,6 +20,8 @@ import java.security.PrivilegedExceptionAction; import java.util.*; import java.io.*; +import java.net.InetSocketAddress; + import javax.servlet.ServletContext; import javax.servlet.ServletException; import 
javax.servlet.http.HttpServlet; @@ -34,11 +36,16 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.server.common.JspHelper; +import org.apache.hadoop.hdfs.server.common.StorageInfo; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; import org.apache.hadoop.hdfs.util.DataTransferThrottler; +import org.apache.hadoop.hdfs.util.MD5FileUtils; import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.util.StringUtils; +import com.google.common.base.Preconditions; + /** * This class is used in Namesystem's jetty to retrieve a file. * Typically used by the Secondary NameNode to retrieve image and @@ -50,15 +57,21 @@ public class GetImageServlet extends HttpServlet { private static final Log LOG = LogFactory.getLog(GetImageServlet.class); - @SuppressWarnings("unchecked") + private static final String TXID_PARAM = "txid"; + private static final String START_TXID_PARAM = "startTxId"; + private static final String END_TXID_PARAM = "endTxId"; + private static final String STORAGEINFO_PARAM = "storageInfo"; + + private static Set currentlyDownloadingCheckpoints = + Collections.synchronizedSet(new HashSet()); + public void doGet(final HttpServletRequest request, final HttpServletResponse response ) throws ServletException, IOException { - Map pmap = request.getParameterMap(); try { ServletContext context = getServletContext(); final FSImage nnImage = NameNodeHttpServer.getFsImageFromContext(context); - final TransferFsImage ff = new TransferFsImage(pmap, request, response); + final GetImageParams parsedParams = new GetImageParams(request, response); final Configuration conf = (Configuration)getServletContext().getAttribute(JspHelper.CURRENT_CONF); @@ -70,45 +83,77 @@ public void doGet(final HttpServletRequest request, + request.getRemoteHost()); return; } - + + String myStorageInfoString = nnImage.getStorage().toColonSeparatedString(); + String theirStorageInfoString = parsedParams.getStorageInfoString(); + if (theirStorageInfoString != null && + !myStorageInfoString.equals(theirStorageInfoString)) { + response.sendError(HttpServletResponse.SC_FORBIDDEN, + "This namenode has storage info " + myStorageInfoString + + " but the secondary expected " + theirStorageInfoString); + LOG.warn("Received an invalid file transfer request " + + "from a secondary with storage info " + theirStorageInfoString); + return; + } + UserGroupInformation.getCurrentUser().doAs(new PrivilegedExceptionAction() { @Override public Void run() throws Exception { - if (ff.getImage()) { - response.setHeader(TransferFsImage.CONTENT_LENGTH, - String.valueOf(nnImage.getStorage() - .getFsImageName().length())); - // send fsImage - TransferFsImage.getFileServer(response.getOutputStream(), - nnImage.getStorage().getFsImageName(), - getThrottler(conf)); - } else if (ff.getEdit()) { - response.setHeader(TransferFsImage.CONTENT_LENGTH, - String.valueOf(nnImage.getStorage() - .getFsEditName().length())); - // send edits - TransferFsImage.getFileServer(response.getOutputStream(), - nnImage.getStorage().getFsEditName(), - getThrottler(conf)); - } else if (ff.putImage()) { - // issue a HTTP get request to download the new fsimage - nnImage.validateCheckpointUpload(ff.getToken()); - nnImage.newImageDigest = ff.getNewChecksum(); - MD5Hash downloadImageDigest = reloginIfNecessary().doAs( - new PrivilegedExceptionAction() { - @Override - public MD5Hash run() throws Exception { - return
TransferFsImage.getFileClient( - ff.getInfoServer(), "getimage=1", - nnImage.getStorage().getFsImageNameCheckpoint(), true); - } - }); - if (!nnImage.newImageDigest.equals(downloadImageDigest)) { - throw new IOException("The downloaded image is corrupt," + - " expecting a checksum " + nnImage.newImageDigest + - " but received a checksum " + downloadImageDigest); + if (parsedParams.isGetImage()) { + long txid = parsedParams.getTxId(); + File imageFile = nnImage.getStorage().getFsImageName(txid); + if (imageFile == null) { + throw new IOException("Could not find image with txid " + txid); + } + setVerificationHeaders(response, imageFile); + // send fsImage + TransferFsImage.getFileServer(response.getOutputStream(), imageFile, + getThrottler(conf)); + } else if (parsedParams.isGetEdit()) { + long startTxId = parsedParams.getStartTxId(); + long endTxId = parsedParams.getEndTxId(); + + File editFile = nnImage.getStorage() + .findFinalizedEditsFile(startTxId, endTxId); + setVerificationHeaders(response, editFile); + + // send edits + TransferFsImage.getFileServer(response.getOutputStream(), editFile, + getThrottler(conf)); + } else if (parsedParams.isPutImage()) { + final long txid = parsedParams.getTxId(); + + if (! currentlyDownloadingCheckpoints.add(txid)) { + throw new IOException( + "Another checkpointer is already in the process of uploading a" + + " checkpoint made at transaction ID " + txid); + } + + try { + if (nnImage.getStorage().findImageFile(txid) != null) { + throw new IOException( + "Another checkpointer already uploaded a checkpoint " + + "for txid " + txid); + } + + // issue an HTTP GET request to download the new fsimage + MD5Hash downloadImageDigest = reloginIfNecessary().doAs( + new PrivilegedExceptionAction() { + @Override + public MD5Hash run() throws Exception { + return TransferFsImage.downloadImageToStorage( + parsedParams.getInfoServer(), txid, + nnImage.getStorage(), true); + } + }); + nnImage.saveDigestAndRenameCheckpointImage(txid, downloadImageDigest); + + // Now that we have a new checkpoint, we might be able to + // remove some old ones. + nnImage.purgeOldStorage(); + } finally { + currentlyDownloadingCheckpoints.remove(txid); } - nnImage.checkpointUploadDone(); } return null; } @@ -182,4 +227,148 @@ protected boolean isValidRequestor(String remoteUser, Configuration conf) if(LOG.isDebugEnabled()) LOG.debug("isValidRequestor is rejecting: " + remoteUser); return false; } + + /** + * Set headers for content length, and, if available, md5.
+ * @throws IOException + */ + private void setVerificationHeaders(HttpServletResponse response, File file) + throws IOException { + response.setHeader(TransferFsImage.CONTENT_LENGTH, + String.valueOf(file.length())); + MD5Hash hash = MD5FileUtils.readStoredMd5ForFile(file); + if (hash != null) { + response.setHeader(TransferFsImage.MD5_HEADER, hash.toString()); + } + } + + static String getParamStringForImage(long txid, + StorageInfo remoteStorageInfo) { + return "getimage=1&" + TXID_PARAM + "=" + txid + + "&" + STORAGEINFO_PARAM + "=" + + remoteStorageInfo.toColonSeparatedString(); + + } + + static String getParamStringForLog(RemoteEditLog log, + StorageInfo remoteStorageInfo) { + return "getedit=1&" + START_TXID_PARAM + "=" + log.getStartTxId() + + "&" + END_TXID_PARAM + "=" + log.getEndTxId() + + "&" + STORAGEINFO_PARAM + "=" + + remoteStorageInfo.toColonSeparatedString(); + } + + static String getParamStringToPutImage(long txid, + InetSocketAddress imageListenAddress, NNStorage storage) { + + return "putimage=1" + + "&" + TXID_PARAM + "=" + txid + + "&port=" + imageListenAddress.getPort() + + "&machine=" + imageListenAddress.getHostName() + + "&" + STORAGEINFO_PARAM + "=" + + storage.toColonSeparatedString(); + } + + + static class GetImageParams { + private boolean isGetImage; + private boolean isGetEdit; + private boolean isPutImage; + private int remoteport; + private String machineName; + private long startTxId, endTxId, txId; + private String storageInfoString; + + /** + * @param request the object from which this servlet reads the url contents + * @param response the object into which this servlet writes the url contents + * @throws IOException if the request is bad + */ + public GetImageParams(HttpServletRequest request, + HttpServletResponse response + ) throws IOException { + @SuppressWarnings("unchecked") + Map pmap = request.getParameterMap(); + isGetImage = isGetEdit = isPutImage = false; + remoteport = 0; + machineName = null; + + for (Map.Entry entry : pmap.entrySet()) { + String key = entry.getKey(); + String[] val = entry.getValue(); + if (key.equals("getimage")) { + isGetImage = true; + txId = parseLongParam(request, TXID_PARAM); + } else if (key.equals("getedit")) { + isGetEdit = true; + startTxId = parseLongParam(request, START_TXID_PARAM); + endTxId = parseLongParam(request, END_TXID_PARAM); + } else if (key.equals("putimage")) { + isPutImage = true; + txId = parseLongParam(request, TXID_PARAM); + } else if (key.equals("port")) { + remoteport = new Integer(val[0]).intValue(); + } else if (key.equals("machine")) { + machineName = val[0]; + } else if (key.equals(STORAGEINFO_PARAM)) { + storageInfoString = val[0]; + } + } + + int numGets = (isGetImage?1:0) + (isGetEdit?1:0); + if ((numGets > 1) || (numGets == 0) && !isPutImage) { + throw new IOException("Illegal parameters to TransferFsImage"); + } + } + + public String getStorageInfoString() { + return storageInfoString; + } + + public long getTxId() { + Preconditions.checkState(isGetImage || isPutImage); + return txId; + } + + public long getStartTxId() { + Preconditions.checkState(isGetEdit); + return startTxId; + } + + public long getEndTxId() { + Preconditions.checkState(isGetEdit); + return endTxId; + } + + boolean isGetEdit() { + return isGetEdit; + } + + boolean isGetImage() { + return isGetImage; + } + + boolean isPutImage() { + return isPutImage; + } + + String getInfoServer() throws IOException{ + if (machineName == null || remoteport == 0) { + throw new IOException ("MachineName and port undefined"); + } + 
return machineName + ":" + remoteport; + } + + private static long parseLongParam(HttpServletRequest request, String param) + throws IOException { + // Parse the 'txid' parameter which indicates which image is to be + // fetched. + String paramStr = request.getParameter(param); + if (paramStr == null) { + throw new IOException("Invalid request has no " + param + " parameter"); + } + + return Long.valueOf(paramStr); + } + } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java new file mode 100644 index 00000000000..56ea5c25129 --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.IOException; + +import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger; + +/** + * A JournalManager is responsible for managing a single place of storing + * edit logs. It may correspond to multiple files, a backup node, etc. + * Even when the actual underlying storage is rolled, or failed and restored, + * each conceptual place of storage corresponds to exactly one instance of + * this class, which is created when the EditLog is first opened. + */ +interface JournalManager { + /** + * Begin writing to a new segment of the log stream, which starts at + * the given transaction ID. + */ + EditLogOutputStream startLogSegment(long txId) throws IOException; + + /** + * Mark the log segment that spans from firstTxId to lastTxId + * as finalized and complete. + */ + void finalizeLogSegment(long firstTxId, long lastTxId) throws IOException; + + /** + * Set the amount of memory that this stream should use to buffer edits + */ + void setOutputBufferCapacity(int size); + + /** + * The JournalManager may archive/purge any logs for transactions less than + * or equal to minImageTxId. + * + * @param minTxIdToKeep the earliest txid that must be retained after purging + * old logs + * @param purger the purging implementation to use + * @throws IOException if purging fails + */ + void purgeLogsOlderThan(long minTxIdToKeep, StoragePurger purger) + throws IOException; + + /** + * @return an EditLogInputStream that reads from the same log that + * the edit log is currently writing. May return null if this journal + * manager does not support this operation. 
+ */ + EditLogInputStream getInProgressInputStream(long segmentStartsAtTxId) + throws IOException; +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java index 29a638567f4..00461e2fb3f 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java @@ -17,22 +17,20 @@ */ package org.apache.hadoop.hdfs.server.namenode; -import static org.apache.hadoop.hdfs.server.common.Util.now; - +import java.io.BufferedReader; import java.io.Closeable; -import java.io.DataInputStream; -import java.io.DataOutputStream; import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; +import java.io.FileReader; import java.io.IOException; import java.io.RandomAccessFile; +import java.io.OutputStream; import java.net.URI; import java.net.UnknownHostException; import java.security.NoSuchAlgorithmException; import java.security.SecureRandom; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Properties; @@ -51,13 +49,17 @@ import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption; import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException; import org.apache.hadoop.hdfs.server.common.Storage; -import org.apache.hadoop.hdfs.server.common.StorageInfo; import org.apache.hadoop.hdfs.server.common.UpgradeManager; import org.apache.hadoop.hdfs.server.common.Util; import org.apache.hadoop.hdfs.server.namenode.JournalStream.JournalType; -import org.apache.hadoop.io.MD5Hash; +import org.apache.hadoop.hdfs.util.AtomicFileOutputStream; + +import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.net.DNS; +import com.google.common.base.Preconditions; +import com.google.common.annotations.VisibleForTesting; + /** * NNStorage is responsible for management of the StorageDirectories used by * the NameNode. @@ -66,17 +68,19 @@ public class NNStorage extends Storage implements Closeable { private static final Log LOG = LogFactory.getLog(NNStorage.class.getName()); - static final String MESSAGE_DIGEST_PROPERTY = "imageMD5Digest"; - + static final String DEPRECATED_MESSAGE_DIGEST_PROPERTY = "imageMD5Digest"; + // // The filenames used for storing the images // enum NameNodeFile { IMAGE ("fsimage"), - TIME ("fstime"), + TIME ("fstime"), // from "old" pre-HDFS-1073 format + SEEN_TXID ("seen_txid"), EDITS ("edits"), IMAGE_NEW ("fsimage.ckpt"), - EDITS_NEW ("edits.new"); + EDITS_NEW ("edits.new"), // from "old" pre-HDFS-1073 format + EDITS_INPROGRESS ("edits_inprogress"); private String fileName = null; private NameNodeFile(String name) { this.fileName = name; } @@ -106,42 +110,9 @@ public boolean isOfType(StorageDirType type) { } } - /** - * Interface to be implemented by classes which make use of storage - * directories. They are notified when a StorageDirectory is causing errors, - * becoming available or being formatted. - * - * This allows the implementors of the interface take their own specific - * action on the StorageDirectory when this occurs. - */ - interface NNStorageListener { - /** - * An error has occurred with a StorageDirectory. - * @param sd The storage directory causing the error. - * @throws IOException - */ - void errorOccurred(StorageDirectory sd) throws IOException; - - /** - * A storage directory has been formatted. - * @param sd The storage directory being formatted. 
- * @throws IOException - */ - void formatOccurred(StorageDirectory sd) throws IOException; - - /** - * A storage directory is now available use. - * @param sd The storage directory which has become available. - * @throws IOException - */ - void directoryAvailable(StorageDirectory sd) throws IOException; - } - - final private List listeners; private UpgradeManager upgradeManager = null; - protected MD5Hash imageDigest = null; protected String blockpoolID = ""; // id of the block pool - + /** * flag that controls if we try to restore failed storages */ @@ -149,7 +120,13 @@ interface NNStorageListener { private Object restorationLock = new Object(); private boolean disablePreUpgradableLayoutCheck = false; - private long checkpointTime = -1L; // The age of the image + + /** + * TxId of the last transaction that was included in the most + * recent fsimage file. This does not include any transactions + * that have since been written to the edit log. + */ + protected long mostRecentCheckpointTxId = FSConstants.INVALID_TXID; /** * list of failed (and thus removed) storages @@ -158,27 +135,26 @@ interface NNStorageListener { = new CopyOnWriteArrayList(); /** - * Construct the NNStorage. - * @param conf Namenode configuration. + * Properties from old layout versions that may be needed + * during upgrade only. */ - public NNStorage(Configuration conf) { - super(NodeType.NAME_NODE); - - storageDirs = new CopyOnWriteArrayList(); - this.listeners = new CopyOnWriteArrayList(); - } + private HashMap deprecatedProperties; /** * Construct the NNStorage. - * @param storageInfo storage information - * @param bpid block pool Id + * @param conf Namenode configuration. + * @param imageDirs Directories the image can be stored in. + * @param editsDirs Directories the editlog can be stored in. + * @throws IOException if any directories are inaccessible. */ - public NNStorage(StorageInfo storageInfo, String bpid) { - super(NodeType.NAME_NODE, storageInfo); + public NNStorage(Configuration conf, + Collection imageDirs, Collection editsDirs) + throws IOException { + super(NodeType.NAME_NODE); storageDirs = new CopyOnWriteArrayList(); - this.listeners = new CopyOnWriteArrayList(); - this.blockpoolID = bpid; + + setStorageDirectories(imageDirs, editsDirs); } @Override // Storage @@ -207,7 +183,6 @@ public boolean isPreUpgradableLayout(StorageDirectory sd) throws IOException { @Override // Closeable public void close() throws IOException { - listeners.clear(); unlockAll(); storageDirs.clear(); } @@ -232,10 +207,7 @@ boolean getRestoreFailedStorage() { /** * See if any of removed storages is "writable" again, and can be returned - * into service. If saveNamespace is set, then this method is being - * called from saveNamespace. - * - * @param saveNamespace Whether method is being called from saveNamespace() + * into service. */ void attemptRestoreRemovedStorage() { // if directory is "alive" - copy the images there... @@ -253,23 +225,10 @@ void attemptRestoreRemovedStorage() { LOG.info("currently disabled dir " + root.getAbsolutePath() + "; type="+sd.getStorageDirType() + ";canwrite="+root.canWrite()); - try { - - if(root.exists() && root.canWrite()) { - // when we try to restore we just need to remove all the data - // without saving current in-memory state (which could've changed). 
- sd.clearDirectory(); - - LOG.info("restoring dir " + sd.getRoot().getAbsolutePath()); - for (NNStorageListener listener : listeners) { - listener.directoryAvailable(sd); - } - - this.addStorageDir(sd); // restore - this.removedStorageDirs.remove(sd); - } - } catch(IOException e) { - LOG.warn("failed to restore " + sd.getRoot().getAbsolutePath(), e); + if(root.exists() && root.canWrite()) { + LOG.info("restoring dir " + sd.getRoot().getAbsolutePath()); + this.addStorageDir(sd); // restore + this.removedStorageDirs.remove(sd); } } } @@ -283,9 +242,11 @@ List getRemovedStorageDirs() { } /** - * Set the storage directories which will be used. NNStorage.close() should - * be called before this to ensure any previous storage directories have been - * freed. + * Set the storage directories which will be used. This should only ever be + * called from inside NNStorage. However, it needs to remain package private + * for testing, as StorageDirectories need to be reinitialised after using + * Mockito.spy() on this class, as Mockito doesn't work well with inner + * classes, such as StorageDirectory in this case. * * Synchronized due to initialization of storageDirs and removedStorageDirs. * @@ -293,6 +254,7 @@ List getRemovedStorageDirs() { * @param fsEditsDirs Locations to store edit logs. * @throws IOException */ + @VisibleForTesting synchronized void setStorageDirectories(Collection fsNameDirs, Collection fsEditsDirs) throws IOException { @@ -411,110 +373,84 @@ Collection getDirectories(NameNodeDirType dirType) } return list; } - + /** - * Determine the checkpoint time of the specified StorageDirectory + * Determine the last transaction ID noted in this storage directory. + * This txid is stored in a special seen_txid file since it might not + * correspond to the latest image or edit log. For example, an image-only + * directory will have this txid incremented when edits logs roll, even + * though the edits logs are in a different directory. * * @param sd StorageDirectory to check - * @return If file exists and can be read, last checkpoint time. If not, 0L. + * @return If file exists and can be read, last recorded txid. If not, 0L. * @throws IOException On errors processing file pointed to by sd */ - long readCheckpointTime(StorageDirectory sd) throws IOException { - File timeFile = getStorageFile(sd, NameNodeFile.TIME); - long timeStamp = 0L; - if (timeFile.exists() && timeFile.canRead()) { - DataInputStream in = new DataInputStream(new FileInputStream(timeFile)); + static long readTransactionIdFile(StorageDirectory sd) throws IOException { + File txidFile = getStorageFile(sd, NameNodeFile.SEEN_TXID); + long txid = 0L; + if (txidFile.exists() && txidFile.canRead()) { + BufferedReader br = new BufferedReader(new FileReader(txidFile)); try { - timeStamp = in.readLong(); + txid = Long.valueOf(br.readLine()); } finally { - in.close(); + IOUtils.cleanup(LOG, br); } } - return timeStamp; + return txid; } - + /** * Write last checkpoint time into a separate file. * * @param sd * @throws IOException */ - public void writeCheckpointTime(StorageDirectory sd) throws IOException { - if (checkpointTime < 0L) - return; // do not write negative time - File timeFile = getStorageFile(sd, NameNodeFile.TIME); - if (timeFile.exists() && ! 
timeFile.delete()) { - LOG.error("Cannot delete chekpoint time file: " - + timeFile.getCanonicalPath()); - } - FileOutputStream fos = new FileOutputStream(timeFile); - DataOutputStream out = new DataOutputStream(fos); + void writeTransactionIdFile(StorageDirectory sd, long txid) throws IOException { + Preconditions.checkArgument(txid >= 0, "bad txid: " + txid); + + File txIdFile = getStorageFile(sd, NameNodeFile.SEEN_TXID); + OutputStream fos = new AtomicFileOutputStream(txIdFile); try { - out.writeLong(checkpointTime); - out.flush(); - fos.getChannel().force(true); + fos.write(String.valueOf(txid).getBytes()); + fos.write('\n'); } finally { - out.close(); + IOUtils.cleanup(LOG, fos); } } /** - * Record new checkpoint time in order to - * distinguish healthy directories from the removed ones. - * If there is an error writing new checkpoint time, the corresponding - * storage directory is removed from the list. + * Set the transaction ID of the last checkpoint */ - public void incrementCheckpointTime() { - setCheckpointTimeInStorage(checkpointTime + 1); + void setMostRecentCheckpointTxId(long txid) { + this.mostRecentCheckpointTxId = txid; } /** - * The age of the namespace state.
    - * Reflects the latest time the image was saved. - * Modified with every save or a checkpoint. - * Persisted in VERSION file. - * - * @return the current checkpoint time. + * Return the transaction ID of the last checkpoint. */ - public long getCheckpointTime() { - return checkpointTime; + long getMostRecentCheckpointTxId() { + return mostRecentCheckpointTxId; } /** - * Set the checkpoint time. - * - * This method does not persist the checkpoint time to storage immediately. + * Write a small file in all available storage directories that + * indicates that the namespace has reached some given transaction ID. * - * @see #setCheckpointTimeInStorage - * @param newCpT the new checkpoint time. + * This is used when the image is loaded to avoid accidental rollbacks + * in the case where an edit log is fully deleted but there is no + * checkpoint. See TestNameEditsConfigs.testNameEditsConfigsFailure() + * @param txid the txid that has been reached */ - public void setCheckpointTime(long newCpT) { - checkpointTime = newCpT; - } - - /** - * Set the current checkpoint time. Writes the new checkpoint - * time to all available storage directories. - * @param newCpT The new checkpoint time. - */ - public void setCheckpointTimeInStorage(long newCpT) { - checkpointTime = newCpT; - // Write new checkpoint time in all storage directories - for(Iterator it = - dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); + public void writeTransactionIdFileToStorage(long txid) { + // Write txid marker in all storage directories + for (StorageDirectory sd : storageDirs) { try { - writeCheckpointTime(sd); + writeTransactionIdFile(sd, txid); } catch(IOException e) { // Close any edits stream associated with this dir and remove directory - LOG.warn("incrementCheckpointTime failed on " - + sd.getRoot().getPath() + ";type="+sd.getStorageDirType()); - try { - reportErrorsOnDirectory(sd); - } catch (IOException ioe) { - LOG.error("Failed to report and remove NN storage directory " - + sd.getRoot().getPath(), ioe); - } + LOG.warn("writeTransactionIdToStorage failed on " + sd, + e); + reportErrorsOnDirectory(sd); } } } @@ -525,11 +461,11 @@ public void setCheckpointTimeInStorage(long newCpT) { * * @return List of filenames to save checkpoints to. */ - public File[] getFsImageNameCheckpoint() { + public File[] getFsImageNameCheckpoint(long txid) { ArrayList list = new ArrayList(); for (Iterator it = dirIterator(NameNodeDirType.IMAGE); it.hasNext();) { - list.add(getStorageFile(it.next(), NameNodeFile.IMAGE_NEW)); + list.add(getStorageFile(it.next(), NameNodeFile.IMAGE_NEW, txid)); } return list.toArray(new File[list.size()]); } @@ -538,51 +474,24 @@ public File[] getFsImageNameCheckpoint() { * Return the name of the image file. * @return The name of the first image file. */ - public File getFsImageName() { + public File getFsImageName(long txid) { StorageDirectory sd = null; for (Iterator it = dirIterator(NameNodeDirType.IMAGE); it.hasNext();) { sd = it.next(); - File fsImage = getStorageFile(sd, NameNodeFile.IMAGE); + File fsImage = getStorageFile(sd, NameNodeFile.IMAGE, txid); if(sd.getRoot().canRead() && fsImage.exists()) return fsImage; } return null; } - /** - * @return The name of the first editlog file. - */ - public File getFsEditName() throws IOException { - for (Iterator it - = dirIterator(NameNodeDirType.EDITS); it.hasNext();) { - StorageDirectory sd = it.next(); - if(sd.getRoot().canRead()) - return getEditFile(sd); - } - return null; - } - - /** - * @return The name of the first time file. 
- */ - public File getFsTimeName() { - StorageDirectory sd = null; - // NameNodeFile.TIME shoul be same on all directories - for (Iterator it = - dirIterator(); it.hasNext();) - sd = it.next(); - return getStorageFile(sd, NameNodeFile.TIME); - } - /** Create new dfs name directory. Caution: this destroys all files * in this filesystem. */ private void format(StorageDirectory sd) throws IOException { sd.clearDirectory(); // create currrent dir - for (NNStorageListener listener : listeners) { - listener.formatOccurred(sd); - } writeProperties(sd); + writeTransactionIdFile(sd, 0); LOG.info("Storage directory " + sd.getRoot() + " has been successfully formatted."); @@ -597,7 +506,6 @@ public void format(String clusterId) throws IOException { this.clusterID = clusterId; this.blockpoolID = newBlockPoolID(); this.cTime = 0L; - this.setCheckpointTime(now()); for (Iterator it = dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); @@ -624,50 +532,6 @@ private int newNamespaceID() { return newID; } - - /** - * Move {@code current} to {@code lastcheckpoint.tmp} and - * recreate empty {@code current}. - * {@code current} is moved only if it is well formatted, - * that is contains VERSION file. - * - * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getLastCheckpointTmp() - * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getPreviousCheckpoint() - */ - protected void moveCurrent(StorageDirectory sd) - throws IOException { - File curDir = sd.getCurrentDir(); - File tmpCkptDir = sd.getLastCheckpointTmp(); - // mv current -> lastcheckpoint.tmp - // only if current is formatted - has VERSION file - if(sd.getVersionFile().exists()) { - assert curDir.exists() : curDir + " directory must exist."; - assert !tmpCkptDir.exists() : tmpCkptDir + " directory must not exist."; - rename(curDir, tmpCkptDir); - } - // recreate current - if(!curDir.exists() && !curDir.mkdir()) - throw new IOException("Cannot create directory " + curDir); - } - - /** - * Move {@code lastcheckpoint.tmp} to {@code previous.checkpoint} - * - * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getPreviousCheckpoint() - * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getLastCheckpointTmp() - */ - protected void moveLastCheckpoint(StorageDirectory sd) - throws IOException { - File tmpCkptDir = sd.getLastCheckpointTmp(); - File prevCkptDir = sd.getPreviousCheckpoint(); - // remove previous.checkpoint - if (prevCkptDir.exists()) - deleteDir(prevCkptDir); - // mv lastcheckpoint.tmp -> previous.checkpoint - if(tmpCkptDir.exists()) - rename(tmpCkptDir, prevCkptDir); - } - @Override // Storage protected void setFieldsFromProperties( Properties props, StorageDirectory sd) throws IOException { @@ -689,26 +553,35 @@ protected void setFieldsFromProperties( setDistributedUpgradeState( sDUS == null? false : Boolean.parseBoolean(sDUS), sDUV == null? 
getLayoutVersion() : Integer.parseInt(sDUV)); - - String sMd5 = props.getProperty(MESSAGE_DIGEST_PROPERTY); - if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM, layoutVersion)) { - if (sMd5 == null) { - throw new InconsistentFSStateException(sd.getRoot(), - "file " + STORAGE_FILE_VERSION - + " does not have MD5 image digest."); - } - this.imageDigest = new MD5Hash(sMd5); - } else if (sMd5 != null) { - throw new InconsistentFSStateException(sd.getRoot(), - "file " + STORAGE_FILE_VERSION + - " has image MD5 digest when version is " + layoutVersion); - } - - this.setCheckpointTime(readCheckpointTime(sd)); + setDeprecatedPropertiesForUpgrade(props); } /** - * Write last checkpoint time and version file into the storage directory. + * Pull any properties out of the VERSION file that are from older + * versions of HDFS and only necessary during upgrade. + */ + private void setDeprecatedPropertiesForUpgrade(Properties props) { + deprecatedProperties = new HashMap(); + String md5 = props.getProperty(DEPRECATED_MESSAGE_DIGEST_PROPERTY); + if (md5 != null) { + deprecatedProperties.put(DEPRECATED_MESSAGE_DIGEST_PROPERTY, md5); + } + } + + /** + * Return a property that was stored in an earlier version of HDFS. + * + * This should only be used during upgrades. + */ + String getDeprecatedProperty(String prop) { + assert getLayoutVersion() > FSConstants.LAYOUT_VERSION : + "getDeprecatedProperty should only be done when loading " + + "storage from past versions during upgrade."; + return deprecatedProperties.get(prop); + } + + /** + * Write version file into the storage directory. * * The version file should always be written last. * Missing or corrupted version file indicates that @@ -733,50 +606,109 @@ protected void setPropertiesFromFields(Properties props, props.setProperty("distributedUpgradeVersion", Integer.toString(uVersion)); } - if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM, layoutVersion)) { - // Though the current NN supports this feature, this function - // is called with old layoutVersions from the upgrade tests. - if (imageDigest == null) { - // May be null on the first save after an upgrade. - imageDigest = MD5Hash.digest( - new FileInputStream(getStorageFile(sd, NameNodeFile.IMAGE))); - } - props.setProperty(MESSAGE_DIGEST_PROPERTY, imageDigest.toString()); - } - - writeCheckpointTime(sd); } - + + static File getStorageFile(StorageDirectory sd, NameNodeFile type, long imageTxId) { + return new File(sd.getCurrentDir(), + String.format("%s_%019d", type.getName(), imageTxId)); + } + /** - * @return A File of 'type' in storage directory 'sd'. 
+ * Get a storage file for one of the files that doesn't need a txid associated + * (e.g version, seen_txid) */ static File getStorageFile(StorageDirectory sd, NameNodeFile type) { return new File(sd.getCurrentDir(), type.getName()); } + @VisibleForTesting + public static String getCheckpointImageFileName(long txid) { + return String.format("%s_%019d", + NameNodeFile.IMAGE_NEW.getName(), txid); + } + + @VisibleForTesting + public static String getImageFileName(long txid) { + return String.format("%s_%019d", + NameNodeFile.IMAGE.getName(), txid); + } + + @VisibleForTesting + public static String getInProgressEditsFileName(long startTxId) { + return String.format("%s_%019d", NameNodeFile.EDITS_INPROGRESS.getName(), + startTxId); + } + + static File getInProgressEditsFile(StorageDirectory sd, long startTxId) { + return new File(sd.getCurrentDir(), getInProgressEditsFileName(startTxId)); + } + + static File getFinalizedEditsFile(StorageDirectory sd, + long startTxId, long endTxId) { + return new File(sd.getCurrentDir(), + getFinalizedEditsFileName(startTxId, endTxId)); + } + + static File getImageFile(StorageDirectory sd, long txid) { + return new File(sd.getCurrentDir(), + getImageFileName(txid)); + } + + @VisibleForTesting + public static String getFinalizedEditsFileName(long startTxId, long endTxId) { + return String.format("%s_%019d-%019d", NameNodeFile.EDITS.getName(), + startTxId, endTxId); + } + /** - * @return A editlog File in storage directory 'sd'. + * Return the first readable finalized edits file for the given txid. */ - File getEditFile(StorageDirectory sd) { - return getStorageFile(sd, NameNodeFile.EDITS); + File findFinalizedEditsFile(long startTxId, long endTxId) + throws IOException { + File ret = findFile(NameNodeDirType.EDITS, + getFinalizedEditsFileName(startTxId, endTxId)); + if (ret == null) { + throw new IOException( + "No edits file for txid " + startTxId + "-" + endTxId + " exists!"); + } + return ret; + } + + /** + * Return the first readable image file for the given txid, or null + * if no such image can be found + */ + File findImageFile(long txid) throws IOException { + return findFile(NameNodeDirType.IMAGE, + getImageFileName(txid)); } /** - * @return A temporary editlog File in storage directory 'sd'. + * Return the first readable storage file of the given name + * across any of the 'current' directories in SDs of the + * given type, or null if no such file exists. */ - File getEditNewFile(StorageDirectory sd) { - return getStorageFile(sd, NameNodeFile.EDITS_NEW); + private File findFile(NameNodeDirType dirType, String name) { + for (StorageDirectory sd : dirIterable(dirType)) { + File candidate = new File(sd.getCurrentDir(), name); + if (sd.getCurrentDir().canRead() && + candidate.exists()) { + return candidate; + } + } + return null; } /** - * @return A list of all Files of 'type' in available storage directories. + * @return A list of the given File in every available storage directory, + * regardless of whether it might exist. */ - Collection getFiles(NameNodeFile type, NameNodeDirType dirType) { + List getFiles(NameNodeDirType dirType, String fileName) { ArrayList list = new ArrayList(); Iterator it = (dirType == null) ? dirIterator() : dirIterator(dirType); for ( ;it.hasNext(); ) { - list.add(getStorageFile(it.next(), type)); + list.add(new File(it.next().getCurrentDir(), fileName)); } return list; } @@ -809,7 +741,9 @@ int getDistributedUpgradeVersion() { * @param uVersion the new version. 
*/ private void setDistributedUpgradeState(boolean uState, int uVersion) { - upgradeManager.setUpgradeState(uState, uVersion); + if (upgradeManager != null) { + upgradeManager.setUpgradeState(uState, uVersion); + } } /** @@ -849,33 +783,6 @@ void initializeDistributedUpgrade() throws IOException { + FSConstants.LAYOUT_VERSION + " is initialized."); } - /** - * Set the digest for the latest image stored by NNStorage. - * @param digest The digest for the image. - */ - void setImageDigest(MD5Hash digest) { - this.imageDigest = digest; - } - - /** - * Get the digest for the latest image storage by NNStorage. - * @return The digest for the latest image. - */ - MD5Hash getImageDigest() { - return imageDigest; - } - - /** - * Register a listener. The listener will be notified of changes to the list - * of available storage directories. - * - * @see NNStorageListener - * @param sel A storage listener. - */ - void registerListener(NNStorageListener sel) { - listeners.add(sel); - } - /** * Disable the check for pre-upgradable layouts. Needed for BackupImage. * @param val Whether to disable the preupgradeable layout check. @@ -890,7 +797,7 @@ void setDisablePreUpgradableLayoutCheck(boolean val) { * @param sds A list of storage directories to mark as errored. * @throws IOException */ - void reportErrorsOnDirectories(List sds) throws IOException { + void reportErrorsOnDirectories(List sds) { for (StorageDirectory sd : sds) { reportErrorsOnDirectory(sd); } @@ -904,17 +811,12 @@ void reportErrorsOnDirectories(List sds) throws IOException { * @param sd A storage directory to mark as errored. * @throws IOException */ - void reportErrorsOnDirectory(StorageDirectory sd) - throws IOException { + void reportErrorsOnDirectory(StorageDirectory sd) { LOG.error("Error reported on storage directory " + sd); String lsd = listStorageDirectories(); LOG.debug("current list of storage dirs:" + lsd); - for (NNStorageListener listener : listeners) { - listener.errorOccurred(sd); - } - LOG.warn("About to remove corresponding storage: " + sd.getRoot().getAbsolutePath()); try { @@ -927,8 +829,7 @@ void reportErrorsOnDirectory(StorageDirectory sd) if (this.storageDirs.remove(sd)) { this.removedStorageDirs.add(sd); } - incrementCheckpointTime(); - + lsd = listStorageDirectories(); LOG.debug("at the end current list of storage dirs:" + lsd); } @@ -967,6 +868,29 @@ void processStartupOptionsForUpgrade(StartupOption startOpt, int layoutVersion) } } + /** + * Report that an IOE has occurred on some file which may + * or may not be within one of the NN image storage directories. + */ + void reportErrorOnFile(File f) { + // We use getAbsolutePath here instead of getCanonicalPath since we know + // that there is some IO problem on that drive. + // getCanonicalPath may need to call stat() or readlink() and it's likely + // those calls would fail due to the same underlying IO problem. + String absPath = f.getAbsolutePath(); + for (StorageDirectory sd : storageDirs) { + String dirPath = sd.getRoot().getAbsolutePath(); + if (!dirPath.endsWith("/")) { + dirPath += "/"; + } + if (absPath.startsWith(dirPath)) { + reportErrorsOnDirectory(sd); + return; + } + } + + } + /** * Generate new clusterID. * @@ -1065,4 +989,67 @@ private void setBlockPoolID(File storage, String bpid) public String getBlockPoolID() { return blockpoolID; } + + /** + * Iterate over all current storage directories, inspecting them + * with the given inspector. 
+ */ + void inspectStorageDirs(FSImageStorageInspector inspector) + throws IOException { + + // Process each of the storage directories to find the pair of + // newest image file and edit file + for (Iterator it = dirIterator(); it.hasNext();) { + StorageDirectory sd = it.next(); + inspector.inspectDirectory(sd); + } + } + + /** + * Iterate over all of the storage dirs, reading their contents to determine + * their layout versions. Returns an FSImageStorageInspector which has + * inspected each directory. + * + * Note: this can mutate the storage info fields (ctime, version, etc). + * @throws IOException if no valid storage dirs are found + */ + FSImageStorageInspector readAndInspectDirs() + throws IOException { + int minLayoutVersion = Integer.MAX_VALUE; // the newest + int maxLayoutVersion = Integer.MIN_VALUE; // the oldest + + // First determine what range of layout versions we're going to inspect + for (Iterator it = dirIterator(); + it.hasNext();) { + StorageDirectory sd = it.next(); + if (!sd.getVersionFile().exists()) { + FSImage.LOG.warn("Storage directory " + sd + " contains no VERSION file. Skipping..."); + continue; + } + readProperties(sd); // sets layoutVersion + minLayoutVersion = Math.min(minLayoutVersion, getLayoutVersion()); + maxLayoutVersion = Math.max(maxLayoutVersion, getLayoutVersion()); + } + + if (minLayoutVersion > maxLayoutVersion) { + throw new IOException("No storage directories contained VERSION information"); + } + assert minLayoutVersion <= maxLayoutVersion; + + // If we have any storage directories with the new layout version + // (ie edits_) then use the new inspector, which will ignore + // the old format dirs. + FSImageStorageInspector inspector; + if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, minLayoutVersion)) { + inspector = new FSImageTransactionalStorageInspector(); + if (!LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, maxLayoutVersion)) { + FSImage.LOG.warn("Ignoring one or more storage directories with old layouts"); + } + } else { + inspector = new FSImagePreTransactionalStorageInspector(); + } + + inspectStorageDirs(inspector); + return inspector; + } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java new file mode 100644 index 00000000000..4b5f9a90902 --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java @@ -0,0 +1,151 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.namenode; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.TreeSet; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.FoundEditLog; +import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.FoundFSImage; +import org.apache.hadoop.hdfs.util.MD5FileUtils; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +/** + * The NNStorageRetentionManager is responsible for inspecting the storage + * directories of the NN and enforcing a retention policy on checkpoints + * and edit logs. + * + * It delegates the actual removal of files to a StoragePurger + * implementation, which might delete the files or instead copy them to + * a filer or HDFS for later analysis. + */ +public class NNStorageRetentionManager { + + private final int numCheckpointsToRetain; + private static final Log LOG = LogFactory.getLog( + NNStorageRetentionManager.class); + private final NNStorage storage; + private final StoragePurger purger; + private final FSEditLog editLog; + + public NNStorageRetentionManager( + Configuration conf, + NNStorage storage, + FSEditLog editLog, + StoragePurger purger) { + this.numCheckpointsToRetain = conf.getInt( + DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY, + DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT); + this.storage = storage; + this.editLog = editLog; + this.purger = purger; + } + + public NNStorageRetentionManager(Configuration conf, NNStorage storage, + FSEditLog editLog) { + this(conf, storage, editLog, new DeletionStoragePurger()); + } + + public void purgeOldStorage() throws IOException { + FSImageTransactionalStorageInspector inspector = + new FSImageTransactionalStorageInspector(); + storage.inspectStorageDirs(inspector); + + long minImageTxId = getImageTxIdToRetain(inspector); + purgeCheckpointsOlderThan(inspector, minImageTxId); + // If fsimage_N is the image we want to keep, then we need to keep + // all txns > N. We can remove anything < N+1, since fsimage_N + // reflects the state up to and including N. + editLog.purgeLogsOlderThan(minImageTxId + 1, purger); + } + + private void purgeCheckpointsOlderThan( + FSImageTransactionalStorageInspector inspector, + long minTxId) { + for (FoundFSImage image : inspector.getFoundImages()) { + if (image.getTxId() < minTxId) { + LOG.info("Purging old image " + image); + purger.purgeImage(image); + } + } + } + + /** + * @param inspector inspector that has already inspected all storage dirs + * @return the transaction ID corresponding to the oldest checkpoint + * that should be retained. 
+ */ + private long getImageTxIdToRetain(FSImageTransactionalStorageInspector inspector) { + + List images = inspector.getFoundImages(); + TreeSet imageTxIds = Sets.newTreeSet(); + for (FoundFSImage image : images) { + imageTxIds.add(image.getTxId()); + } + + List imageTxIdsList = Lists.newArrayList(imageTxIds); + if (imageTxIdsList.isEmpty()) { + return 0; + } + + Collections.reverse(imageTxIdsList); + int toRetain = Math.min(numCheckpointsToRetain, imageTxIdsList.size()); + long minTxId = imageTxIdsList.get(toRetain - 1); + LOG.info("Going to retain " + toRetain + " images with txid >= " + + minTxId); + return minTxId; + } + + /** + * Interface responsible for disposing of old checkpoints and edit logs. + */ + static interface StoragePurger { + void purgeLog(FoundEditLog log); + void purgeImage(FoundFSImage image); + } + + static class DeletionStoragePurger implements StoragePurger { + @Override + public void purgeLog(FoundEditLog log) { + deleteOrWarn(log.getFile()); + } + + @Override + public void purgeImage(FoundFSImage image) { + deleteOrWarn(image.getFile()); + deleteOrWarn(MD5FileUtils.getDigestFileForFile(image.getFile())); + } + + private static void deleteOrWarn(File file) { + if (!file.delete()) { + // It's OK if we fail to delete something -- we'll catch it + // next time we swing through this directory. + LOG.warn("Could not delete " + file); + } + } + } +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java index 4192c42af25..7450afc03aa 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java @@ -78,6 +78,7 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.NodeRegistration; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand; import org.apache.hadoop.io.EnumSetWritable; import org.apache.hadoop.io.Text; @@ -382,7 +383,7 @@ NamenodeRegistration setRegistration() { nodeRegistration = new NamenodeRegistration( getHostPortString(rpcAddress), getHostPortString(getHttpAddress()), - getFSImage().getStorage(), getRole(), getFSImage().getStorage().getCheckpointTime()); + getFSImage().getStorage(), getRole()); return nodeRegistration; } @@ -647,8 +648,9 @@ public void errorReport(NamenodeRegistration registration, public NamenodeRegistration register(NamenodeRegistration registration) throws IOException { verifyVersion(registration.getVersion()); - namesystem.registerBackupNode(registration); - return setRegistration(); + NamenodeRegistration myRegistration = setRegistration(); + namesystem.registerBackupNode(registration, myRegistration); + return myRegistration; } @Override // NamenodeProtocol @@ -669,22 +671,6 @@ public void endCheckpoint(NamenodeRegistration registration, namesystem.endCheckpoint(registration, sig); } - @Override // NamenodeProtocol - public long journalSize(NamenodeRegistration registration) - throws IOException { - verifyRequest(registration); - return namesystem.getEditLogSize(); - } - - @Override // NamenodeProtocol - public void journal(NamenodeRegistration registration, - int jAction, - int length, - byte[] args) throws IOException { - // Active name-node cannot journal. 
- throw new UnsupportedActionException("journal"); - } - @Override // ClientProtocol public Token getDelegationToken(Text renewer) throws IOException { @@ -1056,21 +1042,20 @@ public void refreshNodes() throws IOException { namesystem.refreshNodes(new HdfsConfiguration()); } - @Deprecated // NamenodeProtocol - public long getEditLogSize() throws IOException { - return namesystem.getEditLogSize(); + @Override // NamenodeProtocol + public long getTransactionID() { + return namesystem.getTransactionID(); } - @Deprecated @Override // NamenodeProtocol public CheckpointSignature rollEditLog() throws IOException { return namesystem.rollEditLog(); } - - @Deprecated - @Override // NamenodeProtocol - public void rollFsImage(CheckpointSignature sig) throws IOException { - namesystem.rollFSImage(sig); + + @Override + public RemoteEditLogManifest getEditLogManifest(long sinceTxId) + throws IOException { + return namesystem.getEditLogManifest(sinceTxId); } @Override // ClientProtocol @@ -1279,26 +1264,11 @@ public void verifyVersion(int version) throws IOException { if (version != LAYOUT_VERSION) throw new IncorrectVersionException(version, "data node"); } - - /** - * Returns the name of the fsImage file - */ - public File getFsImageName() throws IOException { - return getFSImage().getStorage().getFsImageName(); - } public FSImage getFSImage() { return namesystem.dir.fsImage; } - /** - * Returns the name of the fsImage file uploaded by periodic - * checkpointing - */ - public File[] getFsImageNameCheckpoint() throws IOException { - return getFSImage().getStorage().getFsImageNameCheckpoint(); - } - /** * Returns the address on which the NameNodes is listening to. * @return namenode rpc address @@ -1374,20 +1344,16 @@ private static boolean format(Configuration conf, } System.out.println("Formatting using clusterid: " + clusterId); - FSImage fsImage = new FSImage(dirsToFormat, editDirsToFormat); + FSImage fsImage = new FSImage(conf, null, dirsToFormat, editDirsToFormat); FSNamesystem nsys = new FSNamesystem(fsImage, conf); - nsys.dir.fsImage.getStorage().format(clusterId); + nsys.dir.fsImage.format(clusterId); return false; } private static boolean finalize(Configuration conf, boolean isConfirmationNeeded ) throws IOException { - Collection dirsToFormat = FSNamesystem.getNamespaceDirs(conf); - Collection editDirsToFormat = - FSNamesystem.getNamespaceEditsDirs(conf); - FSNamesystem nsys = new FSNamesystem(new FSImage(dirsToFormat, - editDirsToFormat), conf); + FSNamesystem nsys = new FSNamesystem(new FSImage(conf), conf); System.err.print( "\"finalize\" will remove the previous state of the files system.\n" + "Recent upgrade will become permanent.\n" diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java index ef3fd456267..f126f17eeb8 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java @@ -23,11 +23,19 @@ import java.net.URI; import java.security.PrivilegedAction; import java.security.PrivilegedExceptionAction; -import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.Iterator; +import java.util.List; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import 
org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.PosixParser; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -43,10 +51,11 @@ import org.apache.hadoop.hdfs.server.common.JspHelper; import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory; import org.apache.hadoop.hdfs.server.common.Storage.StorageState; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; -import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest; import org.apache.hadoop.http.HttpServer; +import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.ipc.RPC; import org.apache.hadoop.ipc.RemoteException; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; @@ -60,6 +69,9 @@ import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.StringUtils; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableList; + /********************************************************** * The Secondary NameNode is a helper to the primary NameNode. * The Secondary is responsible for supporting periodic checkpoints @@ -98,11 +110,18 @@ public class SecondaryNameNode implements Runnable { private int imagePort; private String infoBindAddress; - private FSNamesystem namesystem; private Collection checkpointDirs; private Collection checkpointEditsDirs; + + /** How often to checkpoint regardless of number of txns */ private long checkpointPeriod; // in seconds - private long checkpointSize; // size (in bytes) of current Edit Log + + /** How often to poll the NN to check checkpointTxnCount */ + private long checkpointCheckPeriod; // in seconds + + /** checkpoint once every this many transactions, regardless of time */ + private long checkpointTxnCount; + /** {@inheritDoc} */ public String toString() { @@ -111,23 +130,49 @@ public String toString() { + "\nStart Time : " + new Date(starttime) + "\nLast Checkpoint Time : " + (lastCheckpointTime == 0? "--": new Date(lastCheckpointTime)) + "\nCheckpoint Period : " + checkpointPeriod + " seconds" - + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointSize) - + " (= " + checkpointSize + " bytes)" + + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointTxnCount) + + " (= " + checkpointTxnCount + " bytes)" + "\nCheckpoint Dirs : " + checkpointDirs + "\nCheckpoint Edits Dirs: " + checkpointEditsDirs; } + @VisibleForTesting FSImage getFSImage() { return checkpointImage; } + + @VisibleForTesting + void setFSImage(CheckpointStorage image) { + this.checkpointImage = image; + } + + @VisibleForTesting + NamenodeProtocol getNameNode() { + return namenode; + } + + @VisibleForTesting + void setNameNode(NamenodeProtocol namenode) { + this.namenode = namenode; + } + @VisibleForTesting + List getCheckpointDirs() { + return ImmutableList.copyOf(checkpointDirs); + } + /** * Create a connection to the primary namenode. 
*/ public SecondaryNameNode(Configuration conf) throws IOException { + this(conf, new CommandLineOpts()); + } + + public SecondaryNameNode(Configuration conf, + CommandLineOpts commandLineOpts) throws IOException { try { NameNode.initializeGenericKeys(conf); - initialize(conf); + initialize(conf, commandLineOpts); } catch(IOException e) { shutdown(); LOG.fatal("Failed to start secondary namenode. ", e); @@ -143,8 +188,10 @@ public static InetSocketAddress getHttpAddress(Configuration conf) { /** * Initialize SecondaryNameNode. + * @param commandLineOpts */ - private void initialize(final Configuration conf) throws IOException { + private void initialize(final Configuration conf, + CommandLineOpts commandLineOpts) throws IOException { final InetSocketAddress infoSocAddr = getHttpAddress(conf); infoBindAddress = infoSocAddr.getHostName(); UserGroupInformation.setConfiguration(conf); @@ -171,14 +218,19 @@ private void initialize(final Configuration conf) throws IOException { "/tmp/hadoop/dfs/namesecondary"); checkpointEditsDirs = FSImage.getCheckpointEditsDirs(conf, "/tmp/hadoop/dfs/namesecondary"); - checkpointImage = new CheckpointStorage(conf); - checkpointImage.recoverCreate(checkpointDirs, checkpointEditsDirs); + checkpointImage = new CheckpointStorage(conf, checkpointDirs, checkpointEditsDirs); + checkpointImage.recoverCreate(commandLineOpts.shouldFormat()); // Initialize other scheduling parameters from the configuration + checkpointCheckPeriod = conf.getLong( + DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, + DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT); + checkpointPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT); - checkpointSize = conf.getLong(DFS_NAMENODE_CHECKPOINT_SIZE_KEY, - DFS_NAMENODE_CHECKPOINT_SIZE_DEFAULT); + checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY, + DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT); + warnForDeprecatedConfigs(conf); // initialize the webserver for uploading files. // Kerberized SSL servers must be run from the host principal... @@ -204,8 +256,8 @@ public HttpServer run() throws IOException, InterruptedException { System.setProperty("https.cipherSuites", Krb5AndCertsSslSocketConnector.KRB5_CIPHER_SUITES.get(0)); InetSocketAddress secInfoSocAddr = - NetUtils.createSocketAddr(infoBindAddress + ":"+ conf.get( - "dfs.secondary.https.port", infoBindAddress + ":" + 0)); + NetUtils.createSocketAddr(infoBindAddress + ":"+ conf.getInt( + "dfs.secondary.https.port", 443)); imagePort = secInfoSocAddr.getPort(); infoServer.addSslListener(secInfoSocAddr, conf, false, true); } @@ -227,15 +279,28 @@ public HttpServer run() throws IOException, InterruptedException { // The web-server port can be ephemeral... 
ensure we have the correct info infoPort = infoServer.getPort(); - if(!UserGroupInformation.isSecurityEnabled()) + if (!UserGroupInformation.isSecurityEnabled()) { imagePort = infoPort; + } + conf.set(DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, infoBindAddress + ":" +infoPort); LOG.info("Secondary Web-server up at: " + infoBindAddress + ":" +infoPort); LOG.info("Secondary image servlet up at: " + infoBindAddress + ":" + imagePort); - LOG.warn("Checkpoint Period :" + checkpointPeriod + " secs " + + LOG.info("Checkpoint Period :" + checkpointPeriod + " secs " + "(" + checkpointPeriod/60 + " min)"); - LOG.warn("Log Size Trigger :" + checkpointSize + " bytes " + - "(" + checkpointSize/1024 + " KB)"); + LOG.info("Log Size Trigger :" + checkpointTxnCount + " txns"); + } + + static void warnForDeprecatedConfigs(Configuration conf) { + for (String key : ImmutableList.of( + "fs.checkpoint.size", + "dfs.namenode.checkpoint.size")) { + if (conf.get(key) != null) { + LOG.warn("Configuration key " + key + " is deprecated! Ignoring..." + + " Instead please specify a value for " + + DFS_NAMENODE_CHECKPOINT_TXNS_KEY); + } + } } /** @@ -283,13 +348,10 @@ public Object run() { public void doWork() { // - // Poll the Namenode (once every 5 minutes) to find the size of the - // pending edit log. + // Poll the Namenode (once every checkpointCheckPeriod seconds) to find the + // number of transactions in the edit log that haven't yet been checkpointed. // - long period = 5 * 60; // 5 minutes - if (checkpointPeriod < period) { - period = checkpointPeriod; - } + long period = Math.min(checkpointCheckPeriod, checkpointPeriod); while (shouldRun) { try { @@ -307,8 +369,7 @@ public void doWork() { long now = System.currentTimeMillis(); - long size = namenode.getEditLogSize(); - if (size >= checkpointSize || + if (shouldCheckpointBasedOnCount() || now >= lastCheckpointTime + 1000 * checkpointPeriod) { doCheckpoint(); lastCheckpointTime = now; @@ -316,7 +377,6 @@ public void doWork() { } catch (IOException e) { LOG.error("Exception in doCheckpoint", e); e.printStackTrace(); - checkpointImage.getStorage().imageDigest = null; } catch (Throwable e) { LOG.error("Throwable Exception in doCheckpoint", e); e.printStackTrace(); @@ -331,49 +391,53 @@ public void doWork() { * @return true if a new image has been downloaded and needs to be loaded * @throws IOException */ - private boolean downloadCheckpointFiles(final CheckpointSignature sig - ) throws IOException { + static boolean downloadCheckpointFiles( + final String nnHostPort, + final FSImage dstImage, + final CheckpointSignature sig, + final RemoteEditLogManifest manifest + ) throws IOException { + + // Sanity check manifest - these could happen if, eg, someone on the + // NN side accidentally rmed the storage directories + if (manifest.getLogs().isEmpty()) { + throw new IOException("Found no edit logs to download on NN since txid " + + sig.mostRecentCheckpointTxId); + } + + long expectedTxId = sig.mostRecentCheckpointTxId + 1; + if (manifest.getLogs().get(0).getStartTxId() != expectedTxId) { + throw new IOException("Bad edit log manifest (expected txid = " + + expectedTxId + ": " + manifest); + } + try { Boolean b = UserGroupInformation.getCurrentUser().doAs( new PrivilegedExceptionAction() { @Override public Boolean run() throws Exception { - checkpointImage.getStorage().cTime = sig.cTime; - checkpointImage.getStorage().setCheckpointTime(sig.checkpointTime); + dstImage.getStorage().cTime = sig.cTime; // get fsimage - String fileid; - Collection list; - File[] srcNames; 
boolean downloadImage = true; - if (sig.imageDigest.equals( - checkpointImage.getStorage().imageDigest)) { + if (sig.mostRecentCheckpointTxId == + dstImage.getStorage().getMostRecentCheckpointTxId()) { downloadImage = false; LOG.info("Image has not changed. Will not download image."); } else { - fileid = "getimage=1"; - list = checkpointImage.getStorage().getFiles( - NameNodeFile.IMAGE, NameNodeDirType.IMAGE); - srcNames = list.toArray(new File[list.size()]); - assert srcNames.length > 0 : "No checkpoint targets."; - TransferFsImage.getFileClient(fsName, fileid, srcNames, false); - checkpointImage.getStorage().imageDigest = sig.imageDigest; - LOG.info("Downloaded file " + srcNames[0].getName() + " size " + - srcNames[0].length() + " bytes."); + MD5Hash downloadedHash = TransferFsImage.downloadImageToStorage( + nnHostPort, sig.mostRecentCheckpointTxId, dstImage.getStorage(), true); + dstImage.saveDigestAndRenameCheckpointImage( + sig.mostRecentCheckpointTxId, downloadedHash); } // get edits file - fileid = "getedit=1"; - list = getFSImage().getStorage().getFiles( - NameNodeFile.EDITS, NameNodeDirType.EDITS); - srcNames = list.toArray(new File[list.size()]);; - assert srcNames.length > 0 : "No checkpoint targets."; - TransferFsImage.getFileClient(fsName, fileid, srcNames, false); - LOG.info("Downloaded file " + srcNames[0].getName() + " size " + - srcNames[0].length() + " bytes."); + for (RemoteEditLog log : manifest.getLogs()) { + TransferFsImage.downloadEditsToStorage( + nnHostPort, log, dstImage.getStorage()); + } - checkpointImage.checkpointUploadDone(); return Boolean.valueOf(downloadImage); } }); @@ -387,18 +451,6 @@ InetSocketAddress getNameNodeAddress() { return nameNodeAddr; } - /** - * Copy the new fsimage into the NameNode - */ - private void putFSImage(CheckpointSignature sig) throws IOException { - String fileid = "putimage=1&port=" + imagePort + - "&machine=" + infoBindAddress + - "&token=" + sig.toString() + - "&newChecksum=" + checkpointImage.getStorage().getImageDigest(); - LOG.info("Posted URL " + fsName + fileid); - TransferFsImage.getFileClient(fsName, fileid, (File[])null, false); - } - /** * Returns the Jetty server that the Namenode is listening on. */ @@ -423,19 +475,39 @@ private String getInfoServer() throws IOException { return configuredAddress; } } + + /** + * Return the host:port of where this SecondaryNameNode is listening + * for image transfers + */ + private InetSocketAddress getImageListenAddress() { + return new InetSocketAddress(infoBindAddress, imagePort); + } /** * Create a new checkpoint * @return if the image is fetched from primary or not */ boolean doCheckpoint() throws IOException { - - // Do the required initialization of the merge work area. - startCheckpoint(); - + checkpointImage.ensureCurrentDirExists(); + NNStorage dstStorage = checkpointImage.getStorage(); + // Tell the namenode to start logging transactions in a new edit file // Returns a token that would be used to upload the merged image. CheckpointSignature sig = namenode.rollEditLog(); + + // Make sure we're talking to the same NN! + if (checkpointImage.getNamespaceID() != 0) { + // If the image actually has some data, make sure we're talking + // to the same NN as we did before. + sig.validateStorageInfo(checkpointImage); + } else { + // if we're a fresh 2NN, just take the storage info from the server + // we first talk to. 
+ dstStorage.setStorageInfo(sig); + dstStorage.setClusterID(sig.getClusterID()); + dstStorage.setBlockPoolID(sig.getBlockpoolID()); + } // error simulation code for junit test if (ErrorSimulator.getErrorSimulation(0)) { @@ -443,14 +515,20 @@ boolean doCheckpoint() throws IOException { "after creating edits.new"); } - boolean loadImage = downloadCheckpointFiles(sig); // Fetch fsimage and edits - doMerge(sig, loadImage); // Do the merge - + RemoteEditLogManifest manifest = + namenode.getEditLogManifest(sig.mostRecentCheckpointTxId + 1); + + boolean loadImage = downloadCheckpointFiles( + fsName, checkpointImage, sig, manifest); // Fetch fsimage and edits + doMerge(sig, manifest, loadImage, checkpointImage); + // // Upload the new image into the NameNode. Then tell the Namenode // to make this new uploaded image as the most current image. // - putFSImage(sig); + long txid = checkpointImage.getLastAppliedTxId(); + TransferFsImage.uploadImageFromStorage(fsName, getImageListenAddress(), + dstStorage, txid); // error simulation code for junit test if (ErrorSimulator.getErrorSimulation(1)) { @@ -458,91 +536,53 @@ boolean doCheckpoint() throws IOException { "after uploading new image to NameNode"); } - namenode.rollFsImage(sig); - checkpointImage.endCheckpoint(); - LOG.warn("Checkpoint done. New Image Size: " - + checkpointImage.getStorage().getFsImageName().length()); + + dstStorage.getFsImageName(txid).length()); + + // Since we've successfully checkpointed, we can remove some old + // image files + checkpointImage.purgeOldStorage(); return loadImage; } - - private void startCheckpoint() throws IOException { - checkpointImage.getStorage().unlockAll(); - checkpointImage.getEditLog().close(); - checkpointImage.recoverCreate(checkpointDirs, checkpointEditsDirs); - checkpointImage.startCheckpoint(); - } - - /** - * Merge downloaded image and edits and write the new image into - * current storage directory. - */ - private void doMerge(CheckpointSignature sig, boolean loadImage) - throws IOException { - if (loadImage) { - namesystem = new FSNamesystem(checkpointImage, conf); - } - assert namesystem.dir.fsImage == checkpointImage; - checkpointImage.doMerge(sig, loadImage); - } - + + /** * @param argv The parameters passed to this program. * @exception Exception if the filesystem does not exist. * @return 0 on success, non zero on error. 
*/ - private int processArgs(String[] argv) throws Exception { - - if (argv.length < 1) { - printUsage(""); - return -1; + private int processStartupCommand(CommandLineOpts opts) throws Exception { + if (opts.getCommand() == null) { + return 0; } - - int exitCode = -1; - int i = 0; - String cmd = argv[i++]; - - // - // verify that we have enough command line parameters - // - if ("-geteditsize".equals(cmd)) { - if (argv.length != 1) { - printUsage(cmd); - return exitCode; - } - } else if ("-checkpoint".equals(cmd)) { - if (argv.length != 1 && argv.length != 2) { - printUsage(cmd); - return exitCode; - } - if (argv.length == 2 && !"force".equals(argv[i])) { - printUsage(cmd); - return exitCode; - } - } - - exitCode = 0; + + String cmd = opts.getCommand().toString().toLowerCase(); + + int exitCode = 0; try { - if ("-checkpoint".equals(cmd)) { - long size = namenode.getEditLogSize(); - if (size >= checkpointSize || - argv.length == 2 && "force".equals(argv[i])) { + switch (opts.getCommand()) { + case CHECKPOINT: + long count = countUncheckpointedTxns(); + if (count > checkpointTxnCount || + opts.shouldForceCheckpoint()) { doCheckpoint(); } else { - System.err.println("EditLog size " + size + " bytes is " + + System.err.println("EditLog size " + count + " transactions is " + "smaller than configured checkpoint " + - "size " + checkpointSize + " bytes."); + "interval " + checkpointTxnCount + " transactions."); System.err.println("Skipping checkpoint."); } - } else if ("-geteditsize".equals(cmd)) { - long size = namenode.getEditLogSize(); - System.out.println("EditLog size is " + size + " bytes"); - } else { - exitCode = -1; - LOG.error(cmd.substring(1) + ": Unknown command"); - printUsage(""); + break; + case GETEDITSIZE: + long uncheckpointed = countUncheckpointedTxns(); + System.out.println("NameNode has " + uncheckpointed + + " uncheckpointed transactions"); + break; + default: + throw new AssertionError("bad command enum: " + opts.getCommand()); } + } catch (RemoteException e) { // // This is a error returned by hadoop server. Print @@ -551,41 +591,32 @@ private int processArgs(String[] argv) throws Exception { try { String[] content; content = e.getLocalizedMessage().split("\n"); - LOG.error(cmd.substring(1) + ": " - + content[0]); + LOG.error(cmd + ": " + content[0]); } catch (Exception ex) { - LOG.error(cmd.substring(1) + ": " - + ex.getLocalizedMessage()); + LOG.error(cmd + ": " + ex.getLocalizedMessage()); } } catch (IOException e) { // // IO exception encountered locally. // exitCode = -1; - LOG.error(cmd.substring(1) + ": " - + e.getLocalizedMessage()); + LOG.error(cmd + ": " + e.getLocalizedMessage()); } finally { // Does the RPC connection need to be closed? } return exitCode; } - /** - * Displays format of commands. - * @param cmd The command that is being executed. 
- */ - private void printUsage(String cmd) { - if ("-geteditsize".equals(cmd)) { - System.err.println("Usage: java SecondaryNameNode" - + " [-geteditsize]"); - } else if ("-checkpoint".equals(cmd)) { - System.err.println("Usage: java SecondaryNameNode" - + " [-checkpoint [force]]"); - } else { - System.err.println("Usage: java SecondaryNameNode " + - "[-checkpoint [force]] " + - "[-geteditsize] "); - } + private long countUncheckpointedTxns() throws IOException { + long curTxId = namenode.getTransactionID(); + long uncheckpointedTxns = curTxId - + checkpointImage.getStorage().getMostRecentCheckpointTxId(); + assert uncheckpointedTxns >= 0; + return uncheckpointedTxns; + } + + boolean shouldCheckpointBasedOnCount() throws IOException { + return countUncheckpointedTxns() >= checkpointTxnCount; } /** @@ -594,41 +625,151 @@ private void printUsage(String cmd) { * @exception Exception if the filesystem does not exist. */ public static void main(String[] argv) throws Exception { + CommandLineOpts opts = SecondaryNameNode.parseArgs(argv); + if (opts == null) { + System.exit(-1); + } + StringUtils.startupShutdownMessage(SecondaryNameNode.class, argv, LOG); Configuration tconf = new HdfsConfiguration(); - if (argv.length >= 1) { - SecondaryNameNode secondary = new SecondaryNameNode(tconf); - int ret = secondary.processArgs(argv); + SecondaryNameNode secondary = new SecondaryNameNode(tconf, opts); + + if (opts.getCommand() != null) { + int ret = secondary.processStartupCommand(opts); System.exit(ret); } // Create a never ending deamon - Daemon checkpointThread = new Daemon(new SecondaryNameNode(tconf)); + Daemon checkpointThread = new Daemon(secondary); checkpointThread.start(); } + + + /** + * Container for parsed command-line options. + */ + @SuppressWarnings("static-access") + static class CommandLineOpts { + private final Options options = new Options(); + + private final Option geteditsizeOpt; + private final Option checkpointOpt; + private final Option formatOpt; + + Command cmd; + enum Command { + GETEDITSIZE, + CHECKPOINT; + } + + private boolean shouldForce; + private boolean shouldFormat; + + CommandLineOpts() { + geteditsizeOpt = new Option("geteditsize", + "return the number of uncheckpointed transactions on the NameNode"); + checkpointOpt = OptionBuilder.withArgName("force") + .hasOptionalArg().withDescription("checkpoint on startup").create("checkpoint");; + formatOpt = new Option("format", "format the local storage during startup"); + + options.addOption(geteditsizeOpt); + options.addOption(checkpointOpt); + options.addOption(formatOpt); + } + + public boolean shouldFormat() { + return shouldFormat; + } + + public void parse(String ... 
argv) throws ParseException { + CommandLineParser parser = new PosixParser(); + CommandLine cmdLine = parser.parse(options, argv); + + boolean hasGetEdit = cmdLine.hasOption(geteditsizeOpt.getOpt()); + boolean hasCheckpoint = cmdLine.hasOption(checkpointOpt.getOpt()); + if (hasGetEdit && hasCheckpoint) { + throw new ParseException("May not pass both " + + geteditsizeOpt.getOpt() + " and " + + checkpointOpt.getOpt()); + } + + if (hasGetEdit) { + cmd = Command.GETEDITSIZE; + } else if (hasCheckpoint) { + cmd = Command.CHECKPOINT; + + String arg = cmdLine.getOptionValue(checkpointOpt.getOpt()); + if ("force".equals(arg)) { + shouldForce = true; + } else if (arg != null) { + throw new ParseException("-checkpoint may only take 'force' as an " + + "argument"); + } + } + + if (cmdLine.hasOption(formatOpt.getOpt())) { + shouldFormat = true; + } + } + + public Command getCommand() { + return cmd; + } + + public boolean shouldForceCheckpoint() { + return shouldForce; + } + + void usage() { + HelpFormatter formatter = new HelpFormatter(); + formatter.printHelp("secondarynamenode", options); + } + } + + private static CommandLineOpts parseArgs(String[] argv) { + CommandLineOpts opts = new CommandLineOpts(); + try { + opts.parse(argv); + } catch (ParseException pe) { + LOG.error(pe.getMessage()); + opts.usage(); + return null; + } + return opts; + } + static class CheckpointStorage extends FSImage { /** + * Construct a checkpoint image. + * @param conf Node configuration. + * @param imageDirs URIs of storage for image. + * @param editDirs URIs of storage for edit logs. + * @throws IOException If storage cannot be access. */ - CheckpointStorage(Configuration conf) throws IOException { - super(conf); + CheckpointStorage(Configuration conf, + Collection imageDirs, + Collection editsDirs) throws IOException { + super(conf, (FSNamesystem)null, imageDirs, editsDirs); + setFSNamesystem(new FSNamesystem(this, conf)); + + // the 2NN never writes edits -- it only downloads them. So + // we shouldn't have any editLog instance. Setting to null + // makes sure we don't accidentally depend on it. + editLog = null; } /** * Analyze checkpoint directories. * Create directories if they do not exist. - * Recover from an unsuccessful checkpoint is necessary. - * - * @param dataDirs - * @param editsDirs + * Recover from an unsuccessful checkpoint is necessary. + * * @throws IOException */ - void recoverCreate(Collection dataDirs, - Collection editsDirs) throws IOException { - Collection tempDataDirs = new ArrayList(dataDirs); - Collection tempEditsDirs = new ArrayList(editsDirs); - storage.close(); - storage.setStorageDirectories(tempDataDirs, tempEditsDirs); + void recoverCreate(boolean format) throws IOException { + storage.attemptRestoreRemovedStorage(); + storage.unlockAll(); + for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); @@ -643,6 +784,13 @@ void recoverCreate(Collection dataDirs, if(!isAccessible) throw new InconsistentFSStateException(sd.getRoot(), "cannot access checkpoint directory."); + + if (format) { + // Don't confirm, since this is just the secondary namenode. + LOG.info("Formatting storage directory " + sd); + sd.clearDirectory(); + } + StorageState curState; try { curState = sd.analyzeStorage(HdfsConstants.StartupOption.REGULAR, storage); @@ -655,6 +803,11 @@ void recoverCreate(Collection dataDirs, case NOT_FORMATTED: break; // it's ok since initially there is no current and VERSION case NORMAL: + // Read the VERSION file. 
This verifies that: + // (a) the VERSION file for each of the directories is the same, + // and (b) when we connect to a NN, we can verify that the remote + // node matches the same namespace that we ran on previously. + storage.readProperties(sd); break; default: // recovery is possible sd.doRecover(curState); @@ -665,63 +818,41 @@ void recoverCreate(Collection dataDirs, } } } - + /** - * Prepare directories for a new checkpoint. - *
    - * Rename current to lastcheckpoint.tmp - * and recreate current. - * @throws IOException + * Ensure that the current/ directory exists in all storage + * directories */ - void startCheckpoint() throws IOException { + void ensureCurrentDirExists() throws IOException { for (Iterator it = storage.dirIterator(); it.hasNext();) { StorageDirectory sd = it.next(); - storage.moveCurrent(sd); - } - } - - void endCheckpoint() throws IOException { - for (Iterator it - = storage.dirIterator(); it.hasNext();) { - StorageDirectory sd = it.next(); - storage.moveLastCheckpoint(sd); - } - } - - /** - * Merge image and edits, and verify consistency with the signature. - */ - private void doMerge(CheckpointSignature sig, boolean loadImage) - throws IOException { - getEditLog().open(); - StorageDirectory sdName = null; - StorageDirectory sdEdits = null; - Iterator it = null; - if (loadImage) { - it = getStorage().dirIterator(NameNodeDirType.IMAGE); - if (it.hasNext()) - sdName = it.next(); - if (sdName == null) { - throw new IOException("Could not locate checkpoint fsimage"); + File curDir = sd.getCurrentDir(); + if (!curDir.exists() && !curDir.mkdirs()) { + throw new IOException("Could not create directory " + curDir); } } - it = getStorage().dirIterator(NameNodeDirType.EDITS); - if (it.hasNext()) - sdEdits = it.next(); - if (sdEdits == null) - throw new IOException("Could not locate checkpoint edits"); - if (loadImage) { - // to avoid assert in loadFSImage() - this.getStorage().layoutVersion = -1; - getStorage(); - loadFSImage(NNStorage.getStorageFile(sdName, NameNodeFile.IMAGE)); - } - loadFSEdits(sdEdits); - storage.setClusterID(sig.getClusterID()); - storage.setBlockPoolID(sig.getBlockpoolID()); - sig.validateStorageInfo(this); - saveNamespace(false); } } + + static void doMerge( + CheckpointSignature sig, RemoteEditLogManifest manifest, + boolean loadImage, FSImage dstImage) throws IOException { + NNStorage dstStorage = dstImage.getStorage(); + + dstStorage.setStorageInfo(sig); + if (loadImage) { + File file = dstStorage.findImageFile(sig.mostRecentCheckpointTxId); + if (file == null) { + throw new IOException("Couldn't find image file at txid " + + sig.mostRecentCheckpointTxId + " even though it should have " + + "just been downloaded"); + } + dstImage.reloadFromImageFile(file); + } + + Checkpointer.rollForwardByApplyingLogs(manifest, dstImage); + dstImage.saveFSImageInAllDirs(dstImage.getLastAppliedTxId()); + dstStorage.writeAll(); + } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java index cb92229f46c..302e9743be3 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/TransferFsImage.java @@ -21,19 +21,22 @@ import java.net.*; import java.security.DigestInputStream; import java.security.MessageDigest; -import java.util.Iterator; -import java.util.Map; +import java.util.List; import java.lang.Math; -import javax.servlet.http.HttpServletResponse; -import javax.servlet.http.HttpServletRequest; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.hadoop.security.SecurityUtil; import org.apache.hadoop.hdfs.protocol.FSConstants; +import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType; +import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog; import org.apache.hadoop.hdfs.util.DataTransferThrottler; import 
org.apache.hadoop.hdfs.DFSUtil.ErrorSimulator; import org.apache.hadoop.io.MD5Hash; import org.apache.hadoop.security.UserGroupInformation; +import com.google.common.collect.Lists; + /** * This class provides fetching a specified file from the NameNode. @@ -41,88 +44,77 @@ class TransferFsImage implements FSConstants { public final static String CONTENT_LENGTH = "Content-Length"; - - private boolean isGetImage; - private boolean isGetEdit; - private boolean isPutImage; - private int remoteport; - private String machineName; - private CheckpointSignature token; - private MD5Hash newChecksum = null; - - /** - * File downloader. - * @param pmap key=value[] map that is passed to the http servlet as - * url parameters - * @param request the object from which this servelet reads the url contents - * @param response the object into which this servelet writes the url contents - * @throws IOException - */ - public TransferFsImage(Map pmap, - HttpServletRequest request, - HttpServletResponse response - ) throws IOException { - isGetImage = isGetEdit = isPutImage = false; - remoteport = 0; - machineName = null; - token = null; + public final static String MD5_HEADER = "X-MD5-Digest"; - for (Iterator it = pmap.keySet().iterator(); it.hasNext();) { - String key = it.next(); - if (key.equals("getimage")) { - isGetImage = true; - } else if (key.equals("getedit")) { - isGetEdit = true; - } else if (key.equals("putimage")) { - isPutImage = true; - } else if (key.equals("port")) { - remoteport = new Integer(pmap.get("port")[0]).intValue(); - } else if (key.equals("machine")) { - machineName = pmap.get("machine")[0]; - } else if (key.equals("token")) { - token = new CheckpointSignature(pmap.get("token")[0]); - } else if (key.equals("newChecksum")) { - newChecksum = new MD5Hash(pmap.get("newChecksum")[0]); + private static final Log LOG = LogFactory.getLog(TransferFsImage.class); + + static MD5Hash downloadImageToStorage( + String fsName, long imageTxId, NNStorage dstStorage, boolean needDigest) + throws IOException { + String fileid = GetImageServlet.getParamStringForImage( + imageTxId, dstStorage); + String fileName = NNStorage.getCheckpointImageFileName(imageTxId); + + List dstFiles = dstStorage.getFiles( + NameNodeDirType.IMAGE, fileName); + if (dstFiles.isEmpty()) { + throw new IOException("No targets in destination storage!"); + } + + MD5Hash hash = getFileClient(fsName, fileid, dstFiles, dstStorage, needDigest); + LOG.info("Downloaded file " + dstFiles.get(0).getName() + " size " + + dstFiles.get(0).length() + " bytes."); + return hash; + } + + static void downloadEditsToStorage(String fsName, RemoteEditLog log, + NNStorage dstStorage) throws IOException { + String fileid = GetImageServlet.getParamStringForLog( + log, dstStorage); + String fileName = NNStorage.getFinalizedEditsFileName( + log.getStartTxId(), log.getEndTxId()); + + List dstFiles = dstStorage.getFiles(NameNodeDirType.EDITS, fileName); + assert !dstFiles.isEmpty() : "No checkpoint targets."; + + for (File f : dstFiles) { + if (f.exists() && f.canRead()) { + LOG.info("Skipping download of remote edit log " + + log + " since it already is stored locally at " + f); + return; + } else { + LOG.debug("Dest file: " + f); } } - int numGets = (isGetImage?1:0) + (isGetEdit?1:0); - if ((numGets > 1) || (numGets == 0) && !isPutImage) { - throw new IOException("Illegal parameters to TransferFsImage"); - } + getFileClient(fsName, fileid, dstFiles, dstStorage, false); + LOG.info("Downloaded file " + dstFiles.get(0).getName() + " size " + + 
dstFiles.get(0).length() + " bytes."); } - - boolean getEdit() { - return isGetEdit; - } - - boolean getImage() { - return isGetImage; - } - - boolean putImage() { - return isPutImage; - } - - CheckpointSignature getToken() { - return token; - } - + /** - * Get the MD5 digest of the new image - * @return the MD5 digest of the new image + * Requests that the NameNode download an image from this node. + * + * @param fsName the http address for the remote NN + * @param imageListenAddress the host/port where the local node is running an + * HTTPServer hosting GetImageServlet + * @param storage the storage directory to transfer the image from + * @param txid the transaction ID of the image to be uploaded */ - MD5Hash getNewChecksum() { - return newChecksum; - } - - String getInfoServer() throws IOException{ - if (machineName == null || remoteport == 0) { - throw new IOException ("MachineName and port undefined"); - } - return machineName + ":" + remoteport; + static void uploadImageFromStorage(String fsName, + InetSocketAddress imageListenAddress, + NNStorage storage, long txid) throws IOException { + + String fileid = GetImageServlet.getParamStringToPutImage( + txid, imageListenAddress, storage); + // this doesn't directly upload an image, but rather asks the NN + // to connect back to the 2NN to download the specified image. + TransferFsImage.getFileClient(fsName, fileid, null, null, false); + LOG.info("Uploaded image with txid " + txid + " to namenode at " + + fsName); } + /** * A server-side method to respond to a getfile http request * Copies the contents of the local file into the output stream. @@ -156,6 +148,13 @@ static void getFileServer(OutputStream outstream, File localfile, if (num <= 0) { break; } + + if (ErrorSimulator.getErrorSimulation(4)) { + // Simulate a corrupted byte on the wire + LOG.warn("SIMULATING A CORRUPT BYTE IN IMAGE TRANSFER!"); + buf[0]++; + } + outstream.write(buf, 0, num); if (throttler != null) { throttler.throttle(num); @@ -171,16 +170,17 @@ static void getFileServer(OutputStream outstream, File localfile, /** * Client-side Method to fetch file from a server * Copies the response from the URL to a list of local files. - * + * @param dstStorage if an error occurs writing to one of the files, + * this storage object will be notified. * @Return a digest of the received file if getChecksum is true */ - static MD5Hash getFileClient(String fsName, String id, File[] localPath, - boolean getChecksum) - throws IOException { + static MD5Hash getFileClient(String nnHostPort, + String queryString, List localPaths, + NNStorage dstStorage, boolean getChecksum) throws IOException { byte[] buf = new byte[BUFFER_SIZE]; String proto = UserGroupInformation.isSecurityEnabled() ? 
"https://" : "http://"; - StringBuilder str = new StringBuilder(proto+fsName+"/getimage?"); - str.append(id); + StringBuilder str = new StringBuilder(proto+nnHostPort+"/getimage?"); + str.append(queryString); // // open connection to remote server @@ -189,7 +189,15 @@ static MD5Hash getFileClient(String fsName, String id, File[] localPath, // Avoid Krb bug with cross-realm hosts SecurityUtil.fetchServiceTicket(url); - URLConnection connection = url.openConnection(); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + + if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) { + throw new IOException( + "Image transfer servlet at " + url + + " failed with status code " + connection.getResponseCode() + + "\nResponse message:\n" + connection.getResponseMessage()); + } + long advertisedSize; String contentLength = connection.getHeaderField(CONTENT_LENGTH); if (contentLength != null) { @@ -198,6 +206,9 @@ static MD5Hash getFileClient(String fsName, String id, File[] localPath, throw new IOException(CONTENT_LENGTH + " header is not provided " + "by the namenode when trying to fetch " + str); } + + MD5Hash advertisedDigest = parseMD5Header(connection); + long received = 0; InputStream stream = connection.getInputStream(); MessageDigest digester = null; @@ -205,36 +216,47 @@ static MD5Hash getFileClient(String fsName, String id, File[] localPath, digester = MD5Hash.getDigester(); stream = new DigestInputStream(stream, digester); } - FileOutputStream[] output = null; boolean finishedReceiving = false; + List outputStreams = Lists.newArrayList(); + try { - if (localPath != null) { - output = new FileOutputStream[localPath.length]; - for (int i = 0; i < output.length; i++) { - output[i] = new FileOutputStream(localPath[i]); + if (localPaths != null) { + for (File f : localPaths) { + try { + if (f.exists()) { + LOG.warn("Overwriting existing file " + f + + " with file downloaded from " + str); + } + outputStreams.add(new FileOutputStream(f)); + } catch (IOException ioe) { + LOG.warn("Unable to download file " + f, ioe); + dstStorage.reportErrorOnFile(f); + } + } + + if (outputStreams.isEmpty()) { + throw new IOException( + "Unable to download to any storage directory"); } } + int num = 1; while (num > 0) { num = stream.read(buf); - if (num > 0 && localPath != null) { + if (num > 0) { received += num; - for (int i = 0; i < output.length; i++) { - output[i].write(buf, 0, num); + for (FileOutputStream fos : outputStreams) { + fos.write(buf, 0, num); } } } finishedReceiving = true; } finally { stream.close(); - if (output != null) { - for (int i = 0; i < output.length; i++) { - if (output[i] != null) { - output[i].getChannel().force(true); - output[i].close(); - } - } + for (FileOutputStream fos : outputStreams) { + fos.getChannel().force(true); + fos.close(); } if (finishedReceiving && received != advertisedSize) { // only throw this exception if we think we read all of it on our end @@ -245,6 +267,25 @@ static MD5Hash getFileClient(String fsName, String id, File[] localPath, advertisedSize); } } - return digester==null ? 
null : new MD5Hash(digester.digest()); + + if (digester != null) { + MD5Hash computedDigest = new MD5Hash(digester.digest()); + + if (advertisedDigest != null && + !computedDigest.equals(advertisedDigest)) { + throw new IOException("File " + str + " computed digest " + + computedDigest + " does not match advertised digest " + + advertisedDigest); + } + return computedDigest; + } else { + return null; + } } + + private static MD5Hash parseMD5Header(HttpURLConnection connection) { + String header = connection.getHeaderField(MD5_HEADER); + return (header != null) ? new MD5Hash(header) : null; + } + } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/CheckpointCommand.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/CheckpointCommand.java index f338aa70420..889d635043f 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/CheckpointCommand.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/CheckpointCommand.java @@ -47,19 +47,16 @@ @InterfaceStability.Evolving public class CheckpointCommand extends NamenodeCommand { private CheckpointSignature cSig; - private boolean isImageObsolete; private boolean needToReturnImage; public CheckpointCommand() { - this(null, false, false); + this(null, false); } public CheckpointCommand(CheckpointSignature sig, - boolean isImgObsolete, boolean needToReturnImg) { super(NamenodeProtocol.ACT_CHECKPOINT); this.cSig = sig; - this.isImageObsolete = isImgObsolete; this.needToReturnImage = needToReturnImg; } @@ -71,16 +68,6 @@ public CheckpointSignature getSignature() { return cSig; } - /** - * Indicates whether current backup image is obsolete, and therefore - * need to be discarded? - * - * @return true if current image should be discarded. - */ - public boolean isImageObsolete() { - return isImageObsolete; - } - /** * Indicates whether the new checkpoint image needs to be transfered * back to the name-node after the checkpoint is done. @@ -104,7 +91,6 @@ public boolean needToReturnImage() { public void write(DataOutput out) throws IOException { super.write(out); cSig.write(out); - out.writeBoolean(isImageObsolete); out.writeBoolean(needToReturnImage); } @@ -112,7 +98,6 @@ public void readFields(DataInput in) throws IOException { super.readFields(in); cSig = new CheckpointSignature(); cSig.readFields(in); - isImageObsolete = in.readBoolean(); needToReturnImage = in.readBoolean(); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/JournalProtocol.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/JournalProtocol.java new file mode 100644 index 00000000000..224208d7a3f --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/JournalProtocol.java @@ -0,0 +1,62 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.protocol; + +import java.io.IOException; + +import org.apache.hadoop.classification.InterfaceAudience; +import org.apache.hadoop.hdfs.DFSConfigKeys; +import org.apache.hadoop.ipc.VersionedProtocol; +import org.apache.hadoop.security.KerberosInfo; + +/** + * Protocol used to journal edits to a remote node. Currently, + * this is used to publish edits from the NameNode to a BackupNode. + */ +@KerberosInfo( + serverPrincipal = DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, + clientPrincipal = DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY) +@InterfaceAudience.Private +public interface JournalProtocol extends VersionedProtocol { + public static final long versionID = 1L; + + /** + * Journal edit records. + * This message is sent by the active name-node to the backup node + * via {@code EditLogBackupOutputStream} in order to synchronize meta-data + * changes with the backup namespace image. + * + * @param registration active node registration + * @param firstTxnId the first transaction of this batch + * @param numTxns number of transactions + * @param records byte array containing serialized journal records + */ + public void journal(NamenodeRegistration registration, + long firstTxnId, + int numTxns, + byte[] records) throws IOException; + + /** + * Notify the BackupNode that the NameNode has rolled its edit logs + * and is now writing a new log segment. + * @param registration the registration of the active NameNode + * @param txid the first txid in the new log + */ + public void startLogSegment(NamenodeRegistration registration, + long txid) throws IOException; +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java index 7a8124e41ee..a58d0d086e4 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeProtocol.java @@ -42,21 +42,14 @@ public interface NamenodeProtocol extends VersionedProtocol { * (Only the latest change is reflected. * The log of historical changes can be retrieved from the svn). * - * 5: Added one parameter to rollFSImage() and - * changed the definition of CheckpointSignature + * 6: Switch to txid-based file naming for image and edits */ - public static final long versionID = 5L; + public static final long versionID = 6L; // Error codes passed by errorReport(). final static int NOTIFY = 0; final static int FATAL = 1; - // Journal action codes. See journal(). - public static byte JA_IS_ALIVE = 100; // check whether the journal is alive - public static byte JA_JOURNAL = 101; // just journal - public static byte JA_JSPOOL_START = 102; // = FSEditLogOpCodes.OP_JSPOOL_START - public static byte JA_CHECKPOINT_TIME = 103; // = FSEditLogOpCodes.OP_CHECKPOINT_TIME - public final static int ACT_UNKNOWN = 0; // unknown action public final static int ACT_SHUTDOWN = 50; // shutdown node public final static int ACT_CHECKPOINT = 51; // do checkpoint @@ -84,14 +77,11 @@ public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size) public ExportedBlockKeys getBlockKeys() throws IOException; /** - * Get the size of the current edit log (in bytes). - * @return The number of bytes in the current edit log. + * @return The most recent transaction ID that has been synced to + * persistent storage. 
* @throws IOException - * @deprecated - * See {@link org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode} */ - @Deprecated - public long getEditLogSize() throws IOException; + public long getTransactionID() throws IOException; /** * Closes the current edit log and opens a new one. The @@ -104,20 +94,6 @@ public BlocksWithLocations getBlocks(DatanodeInfo datanode, long size) @Deprecated public CheckpointSignature rollEditLog() throws IOException; - /** - * Rolls the fsImage log. It removes the old fsImage, copies the - * new image to fsImage, removes the old edits and renames edits.new - * to edits. The call fails if any of the four files are missing. - * - * @param sig the signature of this checkpoint (old fsimage) - * @throws IOException - * @deprecated - * See {@link org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode} - */ - @Deprecated - public void rollFsImage(CheckpointSignature sig) - throws IOException; - /** * Request name-node version and storage information. * @@ -177,31 +153,14 @@ public NamenodeCommand startCheckpoint(NamenodeRegistration registration) */ public void endCheckpoint(NamenodeRegistration registration, CheckpointSignature sig) throws IOException; - + + /** - * Get the size of the active name-node journal (edit log) in bytes. - * - * @param registration the requesting node - * @return The number of bytes in the journal. - * @throws IOException + * Return a structure containing details about all edit logs + * available to be fetched from the NameNode. + * @param sinceTxId return only logs that contain transactions >= sinceTxId */ - public long journalSize(NamenodeRegistration registration) throws IOException; - - /** - * Journal edit records. - * This message is sent by the active name-node to the backup node - * via {@code EditLogBackupOutputStream} in order to synchronize meta-data - * changes with the backup namespace image. - * - * @param registration active node registration - * @param jAction journal action - * @param length length of the byte array - * @param records byte array containing serialized journal records - * @throws IOException - */ - public void journal(NamenodeRegistration registration, - int jAction, - int length, - byte[] records) throws IOException; + public RemoteEditLogManifest getEditLogManifest(long sinceTxId) + throws IOException; } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeRegistration.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeRegistration.java index 3abcf98272a..80426605a05 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeRegistration.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamenodeRegistration.java @@ -43,7 +43,6 @@ public class NamenodeRegistration extends StorageInfo String rpcAddress; // RPC address of the node String httpAddress; // HTTP address of the node NamenodeRole role; // node role - long checkpointTime = -1L; // the age of the image public NamenodeRegistration() { super(); @@ -52,14 +51,12 @@ public NamenodeRegistration() { public NamenodeRegistration(String address, String httpAddress, StorageInfo storageInfo, - NamenodeRole role, - long checkpointTime) { + NamenodeRole role) { super(); this.rpcAddress = address; this.httpAddress = httpAddress; this.setStorageInfo(storageInfo); this.role = role; - this.checkpointTime = checkpointTime; } @Override // NodeRegistration @@ -96,13 +93,6 @@ public boolean isRole(NamenodeRole that) { return role.equals(that); } - /** - * Get the age of the image. 
- */ - public long getCheckpointTime() { - return checkpointTime; - } - ///////////////////////////////////////////////// // Writable ///////////////////////////////////////////////// @@ -120,7 +110,6 @@ public void write(DataOutput out) throws IOException { Text.writeString(out, httpAddress); Text.writeString(out, role.name()); super.write(out); - out.writeLong(checkpointTime); } @Override // Writable @@ -129,6 +118,5 @@ public void readFields(DataInput in) throws IOException { httpAddress = Text.readString(in); role = NamenodeRole.valueOf(Text.readString(in)); super.readFields(in); - checkpointTime = in.readLong(); } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamespaceInfo.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamespaceInfo.java index 133d1959579..e847cfc371a 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamespaceInfo.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/NamespaceInfo.java @@ -27,6 +27,7 @@ import org.apache.hadoop.hdfs.protocol.FSConstants; import org.apache.hadoop.hdfs.server.common.Storage; import org.apache.hadoop.hdfs.server.common.StorageInfo; +import org.apache.hadoop.hdfs.server.namenode.NNStorage; import org.apache.hadoop.hdfs.DeprecatedUTF8; import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.WritableFactories; @@ -98,4 +99,26 @@ public void readFields(DataInput in) throws IOException { public String toString(){ return super.toString() + ";bpid=" + blockPoolID; } + + public void validateStorage(NNStorage storage) throws IOException { + if (layoutVersion != storage.getLayoutVersion() || + namespaceID != storage.getNamespaceID() || + cTime != storage.cTime || + !clusterID.equals(storage.getClusterID()) || + !blockPoolID.equals(storage.getBlockPoolID())) { + throw new IOException("Inconsistent namespace information:\n" + + "NamespaceInfo has:\n" + + "LV=" + layoutVersion + ";" + + "NS=" + namespaceID + ";" + + "cTime=" + cTime + ";" + + "CID=" + clusterID + ";" + + "BPID=" + blockPoolID + + ".\nStorage has:\n" + + "LV=" + storage.getLayoutVersion() + ";" + + "NS=" + storage.getNamespaceID() + ";" + + "cTime=" + storage.getCTime() + ";" + + "CID=" + storage.getClusterID() + ";" + + "BPID=" + storage.getBlockPoolID() + "."); + } + } } diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/RemoteEditLog.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/RemoteEditLog.java new file mode 100644 index 00000000000..5801961264f --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/RemoteEditLog.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hdfs.server.protocol; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +import org.apache.hadoop.hdfs.protocol.FSConstants; +import org.apache.hadoop.io.Writable; + +public class RemoteEditLog implements Writable { + private long startTxId = FSConstants.INVALID_TXID; + private long endTxId = FSConstants.INVALID_TXID; + + public RemoteEditLog() { + } + + public RemoteEditLog(long startTxId, long endTxId) { + this.startTxId = startTxId; + this.endTxId = endTxId; + } + + public long getStartTxId() { + return startTxId; + } + + public long getEndTxId() { + return endTxId; + } + + @Override + public String toString() { + return "[" + startTxId + "," + endTxId + "]"; + } + + @Override + public void write(DataOutput out) throws IOException { + out.writeLong(startTxId); + out.writeLong(endTxId); + } + + @Override + public void readFields(DataInput in) throws IOException { + startTxId = in.readLong(); + endTxId = in.readLong(); + } + +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/RemoteEditLogManifest.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/RemoteEditLogManifest.java new file mode 100644 index 00000000000..f871828f840 --- /dev/null +++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/protocol/RemoteEditLogManifest.java @@ -0,0 +1,99 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.protocol; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Collections; +import java.util.List; + +import org.apache.hadoop.io.Writable; + +import com.google.common.base.Joiner; +import com.google.common.base.Preconditions; +import com.google.common.collect.Lists; + +/** + * An enumeration of logs available on a remote NameNode. 
+ */ +public class RemoteEditLogManifest implements Writable { + + private List logs; + + public RemoteEditLogManifest() { + } + + public RemoteEditLogManifest(List logs) { + this.logs = logs; + checkState(); + } + + + /** + * Check that the logs are contiguous and non-overlapping + * sequences of transactions, in sorted order + * @throws IllegalStateException if incorrect + */ + private void checkState() { + Preconditions.checkNotNull(logs); + + RemoteEditLog prev = null; + for (RemoteEditLog log : logs) { + if (prev != null) { + if (log.getStartTxId() != prev.getEndTxId() + 1) { + throw new IllegalStateException("Invalid log manifest:" + this); + } + } + + prev = log; + } + } + + public List getLogs() { + return Collections.unmodifiableList(logs); + } + + + + @Override + public String toString() { + return "[" + Joiner.on(", ").join(logs) + "]"; + } + + + @Override + public void write(DataOutput out) throws IOException { + out.writeInt(logs.size()); + for (RemoteEditLog log : logs) { + log.write(out); + } + } + + @Override + public void readFields(DataInput in) throws IOException { + int numLogs = in.readInt(); + logs = Lists.newArrayList(); + for (int i = 0; i < numLogs; i++) { + RemoteEditLog log = new RemoteEditLog(); + log.readFields(in); + logs.add(log); + } + checkState(); + } +} diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java index 80e292778ed..a01083065dc 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsElement.java @@ -31,6 +31,7 @@ public enum EditsElement { EDITS_VERSION, RECORD, OPCODE, + TRANSACTION_ID, DATA, // elements in the data part of the editLog records LENGTH, diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java index f25c8ebf0df..98398ab2628 100644 --- a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java +++ b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/EditsLoaderCurrent.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hdfs.tools.offlineEditsViewer; +import java.io.EOFException; import java.io.IOException; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -40,7 +41,7 @@ class EditsLoaderCurrent implements EditsLoader { private static int[] supportedVersions = { -18, -19, -20, -21, -22, -23, -24, - -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36 }; + -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38}; private EditsVisitor v; private int editsVersion = 0; @@ -64,11 +65,19 @@ public boolean canLoadVersion(int version) { return false; } + /** + * Visit a transaction ID, if the log version supports it. 
+ */ + private void visitTxId() throws IOException { + if (LayoutVersion.supports(Feature.STORED_TXIDS, editsVersion)) { + v.visitLong(EditsElement.TRANSACTION_ID); + } + } + /** * Visit OP_INVALID */ private void visit_OP_INVALID() throws IOException { - ; // nothing to do, this op code has no data } /** @@ -92,6 +101,7 @@ private void visit_OP_CLOSE() throws IOException { */ private void visit_OP_ADD_or_OP_CLOSE(FSEditLogOpCodes editsOpCode) throws IOException { + visitTxId(); IntToken opAddLength = v.visitInt(EditsElement.LENGTH); // this happens if the edits is not properly ended (-1 op code), @@ -135,6 +145,8 @@ private void visit_OP_ADD_or_OP_CLOSE(FSEditLogOpCodes editsOpCode) * Visit OP_RENAME_OLD */ private void visit_OP_RENAME_OLD() throws IOException { + visitTxId(); + v.visitInt( EditsElement.LENGTH); v.visitStringUTF8( EditsElement.SOURCE); v.visitStringUTF8( EditsElement.DESTINATION); @@ -145,6 +157,8 @@ private void visit_OP_RENAME_OLD() throws IOException { * Visit OP_DELETE */ private void visit_OP_DELETE() throws IOException { + visitTxId(); + v.visitInt( EditsElement.LENGTH); v.visitStringUTF8( EditsElement.PATH); v.visitStringUTF8( EditsElement.TIMESTAMP); @@ -154,6 +168,8 @@ private void visit_OP_DELETE() throws IOException { * Visit OP_MKDIR */ private void visit_OP_MKDIR() throws IOException { + visitTxId(); + v.visitInt( EditsElement.LENGTH); v.visitStringUTF8( EditsElement.PATH); v.visitStringUTF8( EditsElement.TIMESTAMP); @@ -172,6 +188,8 @@ private void visit_OP_MKDIR() throws IOException { * Visit OP_SET_REPLICATION */ private void visit_OP_SET_REPLICATION() throws IOException { + visitTxId(); + v.visitStringUTF8(EditsElement.PATH); v.visitStringUTF8(EditsElement.REPLICATION); } @@ -180,6 +198,8 @@ private void visit_OP_SET_REPLICATION() throws IOException { * Visit OP_SET_PERMISSIONS */ private void visit_OP_SET_PERMISSIONS() throws IOException { + visitTxId(); + v.visitStringUTF8( EditsElement.PATH); v.visitShort( EditsElement.FS_PERMISSIONS); } @@ -188,6 +208,8 @@ private void visit_OP_SET_PERMISSIONS() throws IOException { * Visit OP_SET_OWNER */ private void visit_OP_SET_OWNER() throws IOException { + visitTxId(); + v.visitStringUTF8(EditsElement.PATH); v.visitStringUTF8(EditsElement.USERNAME); v.visitStringUTF8(EditsElement.GROUPNAME); @@ -197,6 +219,8 @@ private void visit_OP_SET_OWNER() throws IOException { * Visit OP_SET_GENSTAMP */ private void visit_OP_SET_GENSTAMP() throws IOException { + visitTxId(); + v.visitLong(EditsElement.GENERATION_STAMP); } @@ -204,6 +228,8 @@ private void visit_OP_SET_GENSTAMP() throws IOException { * Visit OP_TIMES */ private void visit_OP_TIMES() throws IOException { + visitTxId(); + v.visitInt( EditsElement.LENGTH); v.visitStringUTF8( EditsElement.PATH); v.visitStringUTF8( EditsElement.MTIME); @@ -214,6 +240,8 @@ private void visit_OP_TIMES() throws IOException { * Visit OP_SET_QUOTA */ private void visit_OP_SET_QUOTA() throws IOException { + visitTxId(); + v.visitStringUTF8( EditsElement.PATH); v.visitLong( EditsElement.NS_QUOTA); v.visitLong( EditsElement.DS_QUOTA); @@ -223,6 +251,8 @@ private void visit_OP_SET_QUOTA() throws IOException { * Visit OP_RENAME */ private void visit_OP_RENAME() throws IOException { + visitTxId(); + v.visitInt( EditsElement.LENGTH); v.visitStringUTF8( EditsElement.SOURCE); v.visitStringUTF8( EditsElement.DESTINATION); @@ -234,6 +264,8 @@ private void visit_OP_RENAME() throws IOException { * Visit OP_CONCAT_DELETE */ private void visit_OP_CONCAT_DELETE() throws IOException { + visitTxId(); + 
IntToken lengthToken = v.visitInt(EditsElement.LENGTH); v.visitStringUTF8(EditsElement.CONCAT_TARGET); // all except of CONCAT_TARGET and TIMESTAMP @@ -248,6 +280,8 @@ private void visit_OP_CONCAT_DELETE() throws IOException { * Visit OP_SYMLINK */ private void visit_OP_SYMLINK() throws IOException { + visitTxId(); + v.visitInt( EditsElement.LENGTH); v.visitStringUTF8( EditsElement.SOURCE); v.visitStringUTF8( EditsElement.DESTINATION); @@ -267,6 +301,8 @@ private void visit_OP_SYMLINK() throws IOException { * Visit OP_GET_DELEGATION_TOKEN */ private void visit_OP_GET_DELEGATION_TOKEN() throws IOException { + visitTxId(); + v.visitByte( EditsElement.T_VERSION); v.visitStringText( EditsElement.T_OWNER); v.visitStringText( EditsElement.T_RENEWER); @@ -283,6 +319,8 @@ private void visit_OP_GET_DELEGATION_TOKEN() throws IOException { */ private void visit_OP_RENEW_DELEGATION_TOKEN() throws IOException { + visitTxId(); + v.visitByte( EditsElement.T_VERSION); v.visitStringText( EditsElement.T_OWNER); v.visitStringText( EditsElement.T_RENEWER); @@ -299,6 +337,8 @@ private void visit_OP_RENEW_DELEGATION_TOKEN() */ private void visit_OP_CANCEL_DELEGATION_TOKEN() throws IOException { + visitTxId(); + v.visitByte( EditsElement.T_VERSION); v.visitStringText( EditsElement.T_OWNER); v.visitStringText( EditsElement.T_RENEWER); @@ -314,6 +354,8 @@ private void visit_OP_CANCEL_DELEGATION_TOKEN() */ private void visit_OP_UPDATE_MASTER_KEY() throws IOException { + visitTxId(); + v.visitVInt( EditsElement.KEY_ID); v.visitVLong( EditsElement.KEY_EXPIRY_DATE); VIntToken blobLengthToken = v.visitVInt(EditsElement.KEY_LENGTH); @@ -322,11 +364,29 @@ private void visit_OP_UPDATE_MASTER_KEY() private void visit_OP_REASSIGN_LEASE() throws IOException { + visitTxId(); + v.visitStringUTF8(EditsElement.CLIENT_NAME); v.visitStringUTF8(EditsElement.PATH); v.visitStringUTF8(EditsElement.CLIENT_NAME); } + /** + * Visit OP_BEGIN_LOG_SEGMENT + */ + private void visit_OP_BEGIN_LOG_SEGMENT() + throws IOException { + visitTxId(); + } + + /** + * Visit OP_END_LOG_SEGMENT + */ + private void visit_OP_END_LOG_SEGMENT() + throws IOException { + visitTxId(); + } + private void visitOpCode(FSEditLogOpCodes editsOpCode) throws IOException { @@ -391,6 +451,12 @@ private void visitOpCode(FSEditLogOpCodes editsOpCode) case OP_REASSIGN_LEASE: // 22 visit_OP_REASSIGN_LEASE(); break; + case OP_END_LOG_SEGMENT: // 23 + visit_OP_END_LOG_SEGMENT(); + break; + case OP_START_LOG_SEGMENT: // 24 + visit_OP_BEGIN_LOG_SEGMENT(); + break; default: { throw new IOException("Unknown op code " + editsOpCode); @@ -419,7 +485,17 @@ public void loadEdits() throws IOException { do { v.visitEnclosingElement(EditsElement.RECORD); - ByteToken opCodeToken = v.visitByte(EditsElement.OPCODE); + ByteToken opCodeToken; + try { + opCodeToken = v.visitByte(EditsElement.OPCODE); + } catch (EOFException eof) { + // Getting EOF when reading the opcode is fine -- + // it's just a finalized edits file + // Just fake the OP_INVALID here. 
+          opCodeToken = new ByteToken(EditsElement.OPCODE);
+          opCodeToken.fromByte(FSEditLogOpCodes.OP_INVALID.getOpCode());
+          v.visit(opCodeToken);
+        }
 
         editsOpCode = FSEditLogOpCodes.fromByte(opCodeToken.value);
 
         v.visitEnclosingElement(EditsElement.DATA);
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/OfflineEditsViewer.java b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/OfflineEditsViewer.java
index b92c5fc84e4..b4dfb8b3c38 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/OfflineEditsViewer.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/OfflineEditsViewer.java
@@ -17,18 +17,12 @@
  */
 package org.apache.hadoop.hdfs.tools.offlineEditsViewer;
 
-import java.io.BufferedInputStream;
-import java.io.DataInputStream;
 import java.io.EOFException;
-import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
-import java.util.Map;
 
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.util.Tool;
 import org.apache.hadoop.util.ToolRunner;
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/Tokenizer.java b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/Tokenizer.java
index 232d7608b64..8b68fb4b973 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/Tokenizer.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineEditsViewer/Tokenizer.java
@@ -104,6 +104,10 @@ public void fromString(String s) throws IOException {
     public void fromBinary(DataInputStream in) throws IOException {
       value = in.readByte();
     }
+
+    public void fromByte(byte b) {
+      value = b;
+    }
 
     @Override
     public String toString() {
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java
index dee1f4f4411..0a248823098 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageLoaderCurrent.java
@@ -122,7 +122,7 @@ class ImageLoaderCurrent implements ImageLoader {
   protected final DateFormat dateFormat =
       new SimpleDateFormat("yyyy-MM-dd HH:mm");
   private static int[] versions = { -16, -17, -18, -19, -20, -21, -22, -23,
-      -24, -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36 };
+      -24, -25, -26, -27, -28, -30, -31, -32, -33, -34, -35, -36, -37, -38};
   private int imageVersion = 0;
 
   /* (non-Javadoc)
@@ -157,6 +157,10 @@ public void loadImage(DataInputStream in, ImageVisitor v,
 
       v.visit(ImageElement.GENERATION_STAMP, in.readLong());
 
+      if (LayoutVersion.supports(Feature.STORED_TXIDS, imageVersion)) {
+        v.visit(ImageElement.TRANSACTION_ID, in.readLong());
+      }
+
       if (LayoutVersion.supports(Feature.FSIMAGE_COMPRESSION, imageVersion)) {
         boolean isCompressed = in.readBoolean();
         v.visit(ImageElement.IS_COMPRESSED, imageVersion);
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageVisitor.java b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageVisitor.java
index 6c500caded7..9f617ba7bce 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageVisitor.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/tools/offlineImageViewer/ImageVisitor.java
@@ -71,7 +71,8 @@ public enum ImageElement {
     NUM_DELEGATION_TOKENS,
     DELEGATION_TOKENS,
     DELEGATION_TOKEN_IDENTIFIER,
-    DELEGATION_TOKEN_EXPIRY_TIME
+    DELEGATION_TOKEN_EXPIRY_TIME,
+    TRANSACTION_ID
   }
 
   /**
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/util/AtomicFileOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/util/AtomicFileOutputStream.java
new file mode 100644
index 00000000000..9ac4861f384
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/util/AtomicFileOutputStream.java
@@ -0,0 +1,94 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.util;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FilterOutputStream;
+import java.io.IOException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.IOUtils;
+
+/**
+ * A FileOutputStream that has the property that it will only show
+ * up at its destination once it has been entirely written and flushed
+ * to disk. While being written, it will use a .tmp suffix.
+ *
+ * When the output stream is closed, it is flushed, fsynced, and
+ * will be moved into place, overwriting any file that already
+ * exists at that location.
+ *
+ * NOTE: on Windows platforms, it will not atomically
+ * replace the target file - instead the target file is deleted
+ * before this one is moved into place.
+ */
+public class AtomicFileOutputStream extends FilterOutputStream {
+
+  private static final String TMP_EXTENSION = ".tmp";
+
+  private final static Log LOG = LogFactory.getLog(
+      AtomicFileOutputStream.class);
+
+  private final File origFile;
+  private final File tmpFile;
+
+  public AtomicFileOutputStream(File f) throws FileNotFoundException {
+    // Code unfortunately must be duplicated below since we can't assign anything
+    // before calling super
+    super(new FileOutputStream(new File(f.getParentFile(), f.getName() + TMP_EXTENSION)));
+    origFile = f.getAbsoluteFile();
+    tmpFile = new File(f.getParentFile(), f.getName() + TMP_EXTENSION).getAbsoluteFile();
+  }
+
+  @Override
+  public void close() throws IOException {
+    boolean triedToClose = false, success = false;
+    try {
+      flush();
+      ((FileOutputStream)out).getChannel().force(true);
+
+      triedToClose = true;
+      super.close();
+      success = true;
+    } finally {
+      if (success) {
+        boolean renamed = tmpFile.renameTo(origFile);
+        if (!renamed) {
+          // On windows, renameTo does not replace.
+          if (!origFile.delete() || !tmpFile.renameTo(origFile)) {
+            throw new IOException("Could not rename temporary file " +
+                tmpFile + " to " + origFile);
+          }
+        }
+      } else {
+        if (!triedToClose) {
+          // If we failed when flushing, try to close it to not leak an FD
+          IOUtils.closeStream(out);
+        }
+        // close wasn't successful, try to delete the tmp file
+        if (!tmpFile.delete()) {
+          LOG.warn("Unable to delete tmp file " + tmpFile);
+        }
+      }
+    }
+  }
+
+}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/util/MD5FileUtils.java b/hdfs/src/java/org/apache/hadoop/hdfs/util/MD5FileUtils.java
new file mode 100644
index 00000000000..0d691378ba2
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/util/MD5FileUtils.java
@@ -0,0 +1,153 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.util;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.security.DigestInputStream;
+import java.security.MessageDigest;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.util.StringUtils;
+
+/**
+ * Static functions for dealing with files of the same format
+ * that the Unix "md5sum" utility writes.
+ */
+public abstract class MD5FileUtils {
+  private static final Log LOG = LogFactory.getLog(
+      MD5FileUtils.class);
+
+  private static final String MD5_SUFFIX = ".md5";
+  private static final Pattern LINE_REGEX =
+      Pattern.compile("([0-9a-f]{32}) [ \\*](.+)");
+
+  /**
+   * Verify that the previously saved md5 for the given file matches
+   * expectedMD5.
+   * @throws IOException
+   */
+  public static void verifySavedMD5(File dataFile, MD5Hash expectedMD5)
+      throws IOException {
+    MD5Hash storedHash = readStoredMd5ForFile(dataFile);
+    // Check the hash itself
+    if (!expectedMD5.equals(storedHash)) {
+      throw new IOException(
+          "File " + dataFile + " did not match stored MD5 checksum " +
+          "(stored: " + storedHash + ", computed: " + expectedMD5 + ")");
+    }
+  }
+
+  /**
+   * Read the md5 checksum stored alongside the given file, or null
+   * if no md5 is stored.
+   * @param dataFile the file containing data
+   * @return the checksum stored in dataFile.md5
+   */
+  public static MD5Hash readStoredMd5ForFile(File dataFile) throws IOException {
+    File md5File = getDigestFileForFile(dataFile);
+
+    String md5Line;
+
+    if (!md5File.exists()) {
+      return null;
+    }
+
+    BufferedReader reader =
+        new BufferedReader(new FileReader(md5File));
+    try {
+      md5Line = reader.readLine();
+      if (md5Line == null) { md5Line = ""; }
+      md5Line = md5Line.trim();
+    } catch (IOException ioe) {
+      throw new IOException("Error reading md5 file at " + md5File, ioe);
+    } finally {
+      IOUtils.cleanup(LOG, reader);
+    }
+
+    Matcher matcher = LINE_REGEX.matcher(md5Line);
+    if (!matcher.matches()) {
+      throw new IOException("Invalid MD5 file at " + md5File +
+          " (does not match expected pattern)");
+    }
+    String storedHash = matcher.group(1);
+    File referencedFile = new File(matcher.group(2));
+
+    // Sanity check: Make sure that the file referenced in the .md5 file at
+    // least has the same name as the file we expect
+    if (!referencedFile.getName().equals(dataFile.getName())) {
+      throw new IOException(
+          "MD5 file at " + md5File + " references file named " +
+          referencedFile.getName() + " but we expected it to reference " +
+          dataFile);
+    }
+    return new MD5Hash(storedHash);
+  }
+
+  /**
+   * Read dataFile and compute its MD5 checksum.
+   */
+  public static MD5Hash computeMd5ForFile(File dataFile) throws IOException {
+    InputStream in = new FileInputStream(dataFile);
+    try {
+      MessageDigest digester = MD5Hash.getDigester();
+      DigestInputStream dis = new DigestInputStream(in, digester);
+      IOUtils.copyBytes(dis, new IOUtils.NullOutputStream(), 128*1024);
+
+      return new MD5Hash(digester.digest());
+    } finally {
+      IOUtils.closeStream(in);
+    }
+  }
+
+  /**
+   * Save the ".md5" file that lists the md5sum of another file.
+   * @param dataFile the original file whose md5 was computed
+   * @param digest the computed digest
+   * @throws IOException
+   */
+  public static void saveMD5File(File dataFile, MD5Hash digest)
+      throws IOException {
+    File md5File = getDigestFileForFile(dataFile);
+    String digestString = StringUtils.byteToHexString(
+        digest.getDigest());
+    String md5Line = digestString + " *" + dataFile.getName() + "\n";
+
+    AtomicFileOutputStream afos = new AtomicFileOutputStream(md5File);
+    afos.write(md5Line.getBytes());
+    afos.close();
+    LOG.debug("Saved MD5 " + digest + " to " + md5File);
+  }
+
+  /**
+   * @return a reference to the file with .md5 suffix that will
+   * contain the md5 checksum for the given data file.
+   */
+  public static File getDigestFileForFile(File file) {
+    return new File(file.getParentFile(), file.getName() + MD5_SUFFIX);
+  }
+}
diff --git a/hdfs/src/test/findbugsExcludeFile.xml b/hdfs/src/test/findbugsExcludeFile.xml
index 394c37e35f5..9b5d6df12e4 100644
--- a/hdfs/src/test/findbugsExcludeFile.xml
+++ b/hdfs/src/test/findbugsExcludeFile.xml
@@ -229,6 +229,15 @@
+
+
+
+
+
+
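
The new AtomicFileOutputStream and MD5FileUtils classes are intended to work as a pair: a file is written under a ".tmp" name and only renamed into place after a successful flush, fsync, and close, while a sidecar ".md5" file in md5sum format records a checksum that can be verified the next time the file is read. Below is a minimal usage sketch of that flow, assuming the two classes from this patch are on the classpath; the AtomicMd5WriteExample wrapper class and the payload bytes are illustrative only and are not part of the patch.

// Hypothetical usage sketch for AtomicFileOutputStream and MD5FileUtils.
package org.apache.hadoop.hdfs.util;

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.io.MD5Hash;

public class AtomicMd5WriteExample {
  public static void main(String[] args) throws IOException {
    File dataFile = new File(args[0]);

    // Write through AtomicFileOutputStream: bytes go to "<name>.tmp" and are
    // moved onto dataFile only after flush/fsync/close succeed.
    AtomicFileOutputStream out = new AtomicFileOutputStream(dataFile);
    try {
      out.write("example payload".getBytes());
    } finally {
      out.close();
    }

    // Compute the file's MD5 and persist it as "<name>.md5" in md5sum format;
    // saveMD5File() itself writes the sidecar via AtomicFileOutputStream.
    MD5Hash digest = MD5FileUtils.computeMd5ForFile(dataFile);
    MD5FileUtils.saveMD5File(dataFile, digest);

    // Later, recompute the MD5 and check it against the stored sidecar;
    // a mismatch raises an IOException.
    MD5Hash recomputed = MD5FileUtils.computeMd5ForFile(dataFile);
    MD5FileUtils.verifySavedMD5(dataFile, recomputed);

    System.out.println("Verified " + dataFile + " against "
        + MD5FileUtils.getDigestFileForFile(dataFile));
  }
}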