diff --git a/hdfs/ivy/libraries.properties b/hdfs/ivy/libraries.properties
index 00e8331a2fe..8fc14d9d2da 100644
--- a/hdfs/ivy/libraries.properties
+++ b/hdfs/ivy/libraries.properties
@@ -34,6 +34,8 @@ commons-net.version=1.4.1
core.version=3.1.1
coreplugin.version=1.3.2
+guava.version=r09
+
hadoop-common.version=0.23.0-SNAPSHOT
hadoop-hdfs.version=0.23.0-SNAPSHOT
diff --git a/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml b/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml
index 7193747055b..2bfa2e0bbdd 100644
--- a/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml
+++ b/hdfs/src/docs/src/documentation/content/xdocs/hdfs_user_guide.xml
@@ -271,9 +271,9 @@
the maximum delay between two consecutive checkpoints, and
-   <code>dfs.namenode.checkpoint.size</code>, set to 64MB by default, defines the
- size of the edits log file that forces an urgent checkpoint even if
- the maximum checkpoint delay is not reached.
+   <code>dfs.namenode.checkpoint.txns</code>, set to 40000 by default, defines the
+ number of uncheckpointed transactions on the NameNode which will force
+ an urgent checkpoint, even if the checkpoint period has not been reached.
@@ -322,9 +322,9 @@
the maximum delay between two consecutive checkpoints
-   <code>dfs.namenode.checkpoint.size</code>, set to 64MB by default, defines the
- size of the edits log file that forces an urgent checkpoint even if
- the maximum checkpoint delay is not reached.
+   <code>dfs.namenode.checkpoint.txns</code>, set to 40000 by default, defines the
+ number of uncheckpointed transactions on the NameNode which will force
+ an urgent checkpoint, even if the checkpoint period has not been reached.
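
The two triggers described above combine as a simple disjunction. A minimal sketch of the scheduling check (mirroring the Checkpointer.run() loop later in this patch; class, method, and variable names here are illustrative, not part of the patch):

    // Sketch: checkpoint trigger logic with illustrative names.
    class CheckpointTrigger {
      static boolean shouldCheckpoint(long nowMs, long lastCheckpointMs,
                                      long periodMs, long uncheckpointedTxns,
                                      long checkpointTxnCount) {
        if (nowMs >= lastCheckpointMs + periodMs) {
          return true;  // dfs.namenode.checkpoint.period has elapsed
        }
        // dfs.namenode.checkpoint.txns worth of edits have accumulated
        return uncheckpointedTxns >= checkpointTxnCount;
      }
    }
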
diff --git a/hdfs/src/java/hdfs-default.xml b/hdfs/src/java/hdfs-default.xml
index 7557d74eed7..08931de4c3f 100644
--- a/hdfs/src/java/hdfs-default.xml
+++ b/hdfs/src/java/hdfs-default.xml
@@ -582,10 +582,30 @@ creations/deletions), or "all".
-<property>
-  <name>dfs.namenode.checkpoint.size</name>
-  <value>67108864</value>
-  <description>The size of the current edit log (in bytes) that triggers
-       a periodic checkpoint even if the dfs.namenode.checkpoint.period hasn't expired.
-  </description>
-</property>
+<property>
+  <name>dfs.namenode.checkpoint.txns</name>
+  <value>40000</value>
+  <description>The Secondary NameNode or CheckpointNode will create a checkpoint
+  of the namespace every 'dfs.namenode.checkpoint.txns' transactions, regardless
+  of whether 'dfs.namenode.checkpoint.period' has expired.
+  </description>
+</property>
+
+<property>
+  <name>dfs.namenode.checkpoint.check.period</name>
+  <value>60</value>
+  <description>The SecondaryNameNode and CheckpointNode will poll the NameNode
+  every 'dfs.namenode.checkpoint.check.period' seconds to query the number
+  of uncheckpointed transactions.
+  </description>
+</property>
+
+<property>
+  <name>dfs.namenode.num.checkpoints.retained</name>
+  <value>2</value>
+  <description>The number of image checkpoint files that will be retained by
+  the NameNode and Secondary NameNode in their storage directories. All edit
+  logs necessary to recover an up-to-date namespace from the oldest retained
+  checkpoint will also be retained.
+  </description>
+</property>
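
A short sketch of how a checkpointing daemon reads these settings, using the constants added to DFSConfigKeys below (this mirrors Checkpointer.initialize() later in this patch; the wrapper class is illustrative):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    class CheckpointConfReader {
      // Sketch: reading the new checkpoint settings from the configuration.
      static void readSettings(Configuration conf) {
        long txnCount = conf.getLong(
            DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
            DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);          // 40000 txns
        long checkPeriod = conf.getLong(
            DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY,
            DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT);  // 60 seconds
        int retained = conf.getInt(
            DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY,
            DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT); // 2 images
      }
    }
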
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java b/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
index 94fbd9a00b3..b9b3a60e0f4 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
@@ -73,10 +73,12 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_NAMENODE_SAFEMODE_MIN_DATANODES_DEFAULT = 0;
public static final String DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY = "dfs.namenode.secondary.http-address";
public static final String DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_DEFAULT = "0.0.0.0:50090";
+ public static final String DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY = "dfs.namenode.checkpoint.check.period";
+ public static final long DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT = 60;
public static final String DFS_NAMENODE_CHECKPOINT_PERIOD_KEY = "dfs.namenode.checkpoint.period";
public static final long DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT = 3600;
- public static final String DFS_NAMENODE_CHECKPOINT_SIZE_KEY = "dfs.namenode.checkpoint.size";
- public static final long DFS_NAMENODE_CHECKPOINT_SIZE_DEFAULT = 4194304;
+ public static final String DFS_NAMENODE_CHECKPOINT_TXNS_KEY = "dfs.namenode.checkpoint.txns";
+ public static final long DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT = 40000;
public static final String DFS_NAMENODE_UPGRADE_PERMISSION_KEY = "dfs.namenode.upgrade.permission";
public static final int DFS_NAMENODE_UPGRADE_PERMISSION_DEFAULT = 00777;
public static final String DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY = "dfs.namenode.heartbeat.recheck-interval";
@@ -110,6 +112,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final boolean DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT = false;
public static final String DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_KEY = "dfs.namenode.support.allow.format";
public static final boolean DFS_NAMENODE_SUPPORT_ALLOW_FORMAT_DEFAULT = true;
+ public static final String DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY = "dfs.namenode.num.checkpoints.retained";
+ public static final int DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT = 2;
+
public static final String DFS_LIST_LIMIT = "dfs.ls.limit";
public static final int DFS_LIST_LIMIT_DEFAULT = 1000;
public static final String DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY = "dfs.datanode.failed.volumes.tolerated";
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java b/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java
index 33d053561a4..294e5852cbd 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/HdfsConfiguration.java
@@ -85,7 +85,6 @@ private static void addDeprecatedKeys() {
deprecate("fs.checkpoint.dir", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_DIR_KEY);
deprecate("fs.checkpoint.edits.dir", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_EDITS_DIR_KEY);
deprecate("fs.checkpoint.period", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY);
- deprecate("fs.checkpoint.size", DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_SIZE_KEY);
deprecate("dfs.upgrade.permission", DFSConfigKeys.DFS_NAMENODE_UPGRADE_PERMISSION_KEY);
deprecate("heartbeat.recheck.interval", DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY);
deprecate("StorageId", DFSConfigKeys.DFS_DATANODE_STORAGEID_KEY);
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java
index dc516ae3d94..5e803f2282b 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/FSConstants.java
@@ -65,6 +65,9 @@ public enum SafeModeAction{ SAFEMODE_LEAVE, SAFEMODE_ENTER, SAFEMODE_GET; }
// type of the datanode report
public static enum DatanodeReportType {ALL, LIVE, DEAD }
+
+ // An invalid transaction ID that will never be seen in a real namesystem.
+ public static final long INVALID_TXID = -12345;
/**
* Distributed upgrade actions:
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
index 0becfd827d2..a3d7ca7fad8 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/protocol/LayoutVersion.java
@@ -78,7 +78,9 @@ public static enum Feature {
RESERVED_REL22(-33, -27, "Reserved for release 0.22"),
RESERVED_REL23(-34, -30, "Reserved for release 0.23"),
FEDERATION(-35, "Support for namenode federation"),
- LEASE_REASSIGNMENT(-36, "Support for persisting lease holder reassignment");
+ LEASE_REASSIGNMENT(-36, "Support for persisting lease holder reassignment"),
+ STORED_TXIDS(-37, "Transaction IDs are stored in edits log and image files"),
+ TXID_BASED_LAYOUT(-38, "File names in NN Storage are based on transaction IDs");
final int lv;
final int ancestorLV;
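
Code that reads older logs or images gates on these entries with LayoutVersion.supports(Feature, version), the same check this patch removes for EDITS_CHESKUM in BackupImage. A hedged sketch of typical gating:

    // Sketch: feature-gating load behavior on the new layout versions.
    static void describeLayout(int logVersion) {
      if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) {
        // edit log records and image files carry transaction IDs
      }
      if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, logVersion)) {
        // storage files are named by txid, e.g. edits_N-M and fsimage_N
      }
    }
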
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java
index 3d0a4194566..19ad35bb9a7 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/Storage.java
@@ -181,6 +181,16 @@ public Iterator<StorageDirectory> dirIterator(StorageDirType dirType) {
return new DirIterator(dirType);
}
+  public Iterable<StorageDirectory> dirIterable(final StorageDirType dirType) {
+    return new Iterable<StorageDirectory>() {
+      @Override
+      public Iterator<StorageDirectory> iterator() {
+ return dirIterator(dirType);
+ }
+ };
+ }
+
+
/**
* generate storage list (debug line)
*/
@@ -568,13 +578,17 @@ public void lock() throws IOException {
LOG.info("Locking is disabled");
return;
}
- this.lock = tryLock();
- if (lock == null) {
+ FileLock newLock = tryLock();
+ if (newLock == null) {
String msg = "Cannot lock storage " + this.root
+ ". The directory is already locked.";
LOG.info(msg);
throw new IOException(msg);
}
+ // Don't overwrite lock until success - this way if we accidentally
+ // call lock twice, the internal state won't be cleared by the second
+ // (failed) lock attempt
+ lock = newLock;
}
/**
@@ -614,6 +628,45 @@ public void unlock() throws IOException {
lock.channel().close();
lock = null;
}
+
+ @Override
+ public String toString() {
+ return "Storage Directory " + this.root;
+ }
+
+ /**
+ * Check whether underlying file system supports file locking.
+ *
+   * @return <code>true</code> if exclusive locks are supported or
+   *         <code>false</code> otherwise.
+ * @throws IOException
+ * @see StorageDirectory#lock()
+ */
+ public boolean isLockSupported() throws IOException {
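+ // Probe: attempt two exclusive locks on the same lock file. If either
+ // attempt is refused, the FS enforces exclusivity and locking is
+ // supported; if both succeed (e.g. on some NFS mounts), it is not.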
+ FileLock firstLock = null;
+ FileLock secondLock = null;
+ try {
+ firstLock = lock;
+ if(firstLock == null) {
+ firstLock = tryLock();
+ if(firstLock == null)
+ return true;
+ }
+ secondLock = tryLock();
+ if(secondLock == null)
+ return true;
+ } finally {
+ if(firstLock != null && firstLock != lock) {
+ firstLock.release();
+ firstLock.channel().close();
+ }
+ if(secondLock != null) {
+ secondLock.release();
+ secondLock.channel().close();
+ }
+ }
+ return false;
+ }
}
/**
@@ -829,41 +882,6 @@ public void unlockAll() throws IOException {
}
}
- /**
- * Check whether underlying file system supports file locking.
- *
-   * @return <code>true</code> if exclusive locks are supported or
-   *         <code>false</code> otherwise.
- * @throws IOException
- * @see StorageDirectory#lock()
- */
- public boolean isLockSupported(int idx) throws IOException {
- StorageDirectory sd = storageDirs.get(idx);
- FileLock firstLock = null;
- FileLock secondLock = null;
- try {
- firstLock = sd.lock;
- if(firstLock == null) {
- firstLock = sd.tryLock();
- if(firstLock == null)
- return true;
- }
- secondLock = sd.tryLock();
- if(secondLock == null)
- return true;
- } finally {
- if(firstLock != null && firstLock != sd.lock) {
- firstLock.release();
- firstLock.channel().close();
- }
- if(secondLock != null) {
- secondLock.release();
- secondLock.channel().close();
- }
- }
- return false;
- }
-
public static String getBuildVersion() {
return VersionInfo.getRevision();
}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java
index 5f152999ec8..95573f38f09 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/common/StorageInfo.java
@@ -25,6 +25,8 @@
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
+import com.google.common.base.Joiner;
+
/**
* Common class for storage information.
*
@@ -105,4 +107,9 @@ public String toString() {
.append(";nsid=").append(namespaceID).append(";c=").append(cTime);
return sb.toString();
}
+
+ public String toColonSeparatedString() {
+ return Joiner.on(":").join(
+ layoutVersion, namespaceID, cTime, clusterID);
+ }
}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java
index e4ee76b7716..eca31fe4d04 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupImage.java
@@ -19,29 +19,21 @@
import java.io.BufferedInputStream;
import java.io.DataInputStream;
-import java.io.File;
import java.io.IOException;
-import java.net.URI;
-import java.util.Collection;
import java.util.Iterator;
-import java.util.zip.CheckedInputStream;
import java.util.zip.Checksum;
import org.apache.hadoop.classification.InterfaceAudience;
-import org.apache.hadoop.hdfs.protocol.LayoutVersion;
-import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.common.HdfsConstants;
+import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
-import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
-import static org.apache.hadoop.hdfs.server.common.Util.now;
-import org.apache.hadoop.hdfs.server.namenode.FSImage;
-import org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
-import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.LogLoadPlan;
+import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.util.StringUtils;
+
+import com.google.common.base.Preconditions;
/**
* Extension of FSImage for the backup node.
@@ -50,29 +42,56 @@
*/
@InterfaceAudience.Private
public class BackupImage extends FSImage {
- // Names of the journal spool directory and the spool file
- private static final String STORAGE_JSPOOL_DIR = "jspool";
- private static final String STORAGE_JSPOOL_FILE =
- NNStorage.NameNodeFile.EDITS_NEW.getName();
-
/** Backup input stream for loading edits into memory */
- private EditLogBackupInputStream backupInputStream;
-
- /** Is journal spooling in progress */
- volatile JSpoolState jsState;
-
- static enum JSpoolState {
- OFF,
- INPROGRESS,
- WAIT;
+ private EditLogBackupInputStream backupInputStream =
+ new EditLogBackupInputStream("Data from remote NameNode");
+
+ /**
+ * Current state of the BackupNode. The BackupNode's state
+ * transitions are as follows:
+ *
+ * Initial: DROP_UNTIL_NEXT_ROLL
+ * - Transitions to JOURNAL_ONLY the next time the log rolls
+ * - Transitions to IN_SYNC in convergeJournalSpool
+ * - Transitions back to JOURNAL_ONLY if the log rolls while
+ *   stopApplyingEditsOnNextRoll is true.
+ */
+ volatile BNState bnState;
+ static enum BNState {
+ /**
+ * Edits from the NN should be dropped. On the next log roll,
+ * transition to JOURNAL_ONLY state
+ */
+ DROP_UNTIL_NEXT_ROLL,
+ /**
+ * Edits from the NN should be written to the local edits log
+ * but not applied to the namespace.
+ */
+ JOURNAL_ONLY,
+ /**
+ * Edits should be written to the local edits log and applied
+ * to the local namespace.
+ */
+ IN_SYNC;
}
/**
+ * Flag to indicate that the next time the NN rolls, the BN
+ * should transition from IN_SYNC to JOURNAL_ONLY state.
+ * @see #freezeNamespaceAtNextRoll()
*/
- BackupImage() {
- super();
+ private boolean stopApplyingEditsOnNextRoll = false;
+
+ /**
+ * Construct a backup image.
+ * @param conf Configuration
+ * @throws IOException if storage cannot be initialised.
+ */
+ BackupImage(Configuration conf) throws IOException {
+ super(conf);
storage.setDisablePreUpgradableLayoutCheck(true);
- jsState = JSpoolState.OFF;
+ bnState = BNState.DROP_UNTIL_NEXT_ROLL;
+ editLog.initJournals();
}
/**
@@ -81,14 +100,9 @@ static enum JSpoolState {
* Read VERSION and fstime files if exist.
* Do not load image or edits.
*
- * @param imageDirs list of image directories as URI.
- * @param editsDirs list of edits directories URI.
* @throws IOException if the node should shutdown.
*/
-  void recoverCreateRead(Collection<URI> imageDirs,
-                         Collection<URI> editsDirs) throws IOException {
- storage.setStorageDirectories(imageDirs, editsDirs);
- storage.setCheckpointTime(0L);
+ void recoverCreateRead() throws IOException {
for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
StorageState curState;
@@ -122,283 +136,261 @@ void recoverCreateRead(Collection imageDirs,
}
}
- /**
- * Reset storage directories.
- *
- * Unlock the storage.
-   * Rename <code>current</code> to <code>lastcheckpoint.tmp</code>
-   * and recreate empty <code>current</code>.
- * @throws IOException
- */
- synchronized void reset() throws IOException {
- // reset NameSpace tree
- FSDirectory fsDir = getFSNamesystem().dir;
- fsDir.reset();
-
- // unlock, close and rename storage directories
- storage.unlockAll();
- // recover from unsuccessful checkpoint if necessary
- recoverCreateRead(storage.getImageDirectories(),
- storage.getEditsDirectories());
- // rename and recreate
-    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- // rename current to lastcheckpoint.tmp
- storage.moveCurrent(sd);
- }
- }
-
- /**
- * Load checkpoint from local files only if the memory state is empty.
- * Set new checkpoint time received from the name-node.
-   * Move <code>lastcheckpoint.tmp</code> to <code>previous.checkpoint</code>.
- * @throws IOException
- */
- void loadCheckpoint(CheckpointSignature sig) throws IOException {
- // load current image and journal if it is not in memory already
- if(!editLog.isOpen())
- editLog.open();
-
- FSDirectory fsDir = getFSNamesystem().dir;
- if(fsDir.isEmpty()) {
-      Iterator<StorageDirectory> itImage
-          = storage.dirIterator(NameNodeDirType.IMAGE);
-      Iterator<StorageDirectory> itEdits
-          = storage.dirIterator(NameNodeDirType.EDITS);
- if(!itImage.hasNext() || ! itEdits.hasNext())
- throw new IOException("Could not locate checkpoint directories");
- StorageDirectory sdName = itImage.next();
- StorageDirectory sdEdits = itEdits.next();
- getFSDirectoryRootLock().writeLock();
- try { // load image under rootDir lock
- loadFSImage(NNStorage.getStorageFile(sdName, NameNodeFile.IMAGE));
- } finally {
- getFSDirectoryRootLock().writeUnlock();
- }
- loadFSEdits(sdEdits);
- }
-
- // set storage fields
- storage.setStorageInfo(sig);
- storage.setImageDigest(sig.imageDigest);
- storage.setCheckpointTime(sig.checkpointTime);
- }
-
/**
* Save meta-data into fsimage files.
* and create empty edits.
*/
void saveCheckpoint() throws IOException {
- saveNamespace(false);
- }
-
- private FSDirectory getFSDirectoryRootLock() {
- return getFSNamesystem().dir;
- }
-
- static File getJSpoolDir(StorageDirectory sd) {
- return new File(sd.getRoot(), STORAGE_JSPOOL_DIR);
- }
-
- static File getJSpoolFile(StorageDirectory sd) {
- return new File(getJSpoolDir(sd), STORAGE_JSPOOL_FILE);
+ saveNamespace();
}
/**
- * Journal writer journals new meta-data state.
- *
- * - If Journal Spool state is OFF then journal records (edits)
- * are applied directly to meta-data state in memory and are written
- * to the edits file(s).
- * - If Journal Spool state is INPROGRESS then records are only
- * written to edits.new file, which is called Spooling.
- * - Journal Spool state WAIT blocks journaling until the
- * Journal Spool reader finalizes merging of the spooled data and
- * switches to applying journal to memory.
- *
- * @param length length of data.
+ * Receive a batch of edits from the NameNode.
+ *
+ * Depending on bnState, different actions are taken. See
+ * {@link BackupImage.BNState}
+ *
+ * @param firstTxId first txid in batch
+ * @param numTxns number of transactions
* @param data serialized journal records.
* @throws IOException
* @see #convergeJournalSpool()
*/
- synchronized void journal(int length, byte[] data) throws IOException {
- assert backupInputStream.length() == 0 : "backup input stream is not empty";
- try {
- switch(jsState) {
- case WAIT:
- case OFF:
- // wait until spooling is off
- waitSpoolEnd();
- // update NameSpace in memory
- backupInputStream.setBytes(data);
- FSEditLogLoader logLoader = new FSEditLogLoader(namesystem);
- int logVersion = storage.getLayoutVersion();
- BufferedInputStream bin = new BufferedInputStream(backupInputStream);
- DataInputStream in = new DataInputStream(bin);
- Checksum checksum = null;
- if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) {
- checksum = FSEditLog.getChecksum();
- in = new DataInputStream(new CheckedInputStream(bin, checksum));
- }
- logLoader.loadEditRecords(logVersion, in, checksum, true);
- getFSNamesystem().dir.updateCountForINodeWithQuota(); // inefficient!
- break;
- case INPROGRESS:
- break;
- }
- // write to files
- editLog.logEdit(length, data);
- editLog.logSync();
- } finally {
- backupInputStream.clear();
+ synchronized void journal(long firstTxId, int numTxns, byte[] data) throws IOException {
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Got journal, " +
+ "state = " + bnState +
+ "; firstTxId = " + firstTxId +
+ "; numTxns = " + numTxns);
}
- }
-
- private synchronized void waitSpoolEnd() {
- while(jsState == JSpoolState.WAIT) {
- try {
- wait();
- } catch (InterruptedException e) {}
- }
- // now spooling should be off, verifying just in case
- assert jsState == JSpoolState.OFF : "Unexpected JSpool state: " + jsState;
- }
-
- /**
- * Start journal spool.
- * Switch to writing into edits.new instead of edits.
- *
- * edits.new for spooling is in separate directory "spool" rather than in
- * "current" because the two directories should be independent.
- * While spooling a checkpoint can happen and current will first
- * move to lastcheckpoint.tmp and then to previous.checkpoint
- * spool/edits.new will remain in place during that.
- */
- synchronized void startJournalSpool(NamenodeRegistration nnReg)
- throws IOException {
- switch(jsState) {
- case OFF:
- break;
- case INPROGRESS:
+
+ switch(bnState) {
+ case DROP_UNTIL_NEXT_ROLL:
return;
- case WAIT:
- waitSpoolEnd();
+
+ case IN_SYNC:
+ // update NameSpace in memory
+ applyEdits(firstTxId, numTxns, data);
+ break;
+
+ case JOURNAL_ONLY:
+ break;
+
+ default:
+ throw new AssertionError("Unhandled state: " + bnState);
}
-
- // create journal spool directories
-    for (Iterator<StorageDirectory> it
-           = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
- StorageDirectory sd = it.next();
- File jsDir = getJSpoolDir(sd);
- if (!jsDir.exists() && !jsDir.mkdirs()) {
- throw new IOException("Mkdirs failed to create "
- + jsDir.getCanonicalPath());
- }
- // create edit file if missing
- File eFile = storage.getEditFile(sd);
- if(!eFile.exists()) {
- editLog.createEditLogFile(eFile);
- }
- }
-
- if(!editLog.isOpen())
- editLog.open();
-
- // create streams pointing to the journal spool files
- // subsequent journal records will go directly to the spool
- editLog.divertFileStreams(STORAGE_JSPOOL_DIR + "/" + STORAGE_JSPOOL_FILE);
- setCheckpointState(CheckpointStates.ROLLED_EDITS);
-
- // set up spooling
- if(backupInputStream == null)
- backupInputStream = new EditLogBackupInputStream(nnReg.getAddress());
- jsState = JSpoolState.INPROGRESS;
+
+ // write to BN's local edit log.
+ logEditsLocally(firstTxId, numTxns, data);
}
- synchronized void setCheckpointTime(int length, byte[] data)
- throws IOException {
+ /**
+ * Write the batch of edits to the local copy of the edit logs.
+ */
+ private void logEditsLocally(long firstTxId, int numTxns, byte[] data) {
+ long expectedTxId = editLog.getLastWrittenTxId() + 1;
+ Preconditions.checkState(firstTxId == expectedTxId,
+ "received txid batch starting at %s but expected txn %s",
+ firstTxId, expectedTxId);
+ editLog.setNextTxId(firstTxId + numTxns - 1);
+ editLog.logEdit(data.length, data);
+ editLog.logSync();
+ }
+
+ /**
+ * Apply the batch of edits to the local namespace.
+ */
+ private synchronized void applyEdits(long firstTxId, int numTxns, byte[] data)
+ throws IOException {
+ Preconditions.checkArgument(firstTxId == lastAppliedTxId + 1,
+ "Received txn batch starting at %s but expected %s",
+ firstTxId, lastAppliedTxId + 1);
assert backupInputStream.length() == 0 : "backup input stream is not empty";
try {
- // unpack new checkpoint time
+ if (LOG.isTraceEnabled()) {
+ LOG.debug("data:" + StringUtils.byteToHexString(data));
+ }
backupInputStream.setBytes(data);
- DataInputStream in = backupInputStream.getDataInputStream();
- byte op = in.readByte();
- assert op == NamenodeProtocol.JA_CHECKPOINT_TIME;
- LongWritable lw = new LongWritable();
- lw.readFields(in);
- storage.setCheckpointTimeInStorage(lw.get());
+ FSEditLogLoader logLoader = new FSEditLogLoader(namesystem);
+ int logVersion = storage.getLayoutVersion();
+ BufferedInputStream bin = new BufferedInputStream(backupInputStream);
+ DataInputStream in = new DataInputStream(bin);
+ Checksum checksum = FSEditLog.getChecksum();
+ int numLoaded = logLoader.loadEditRecords(logVersion, in, checksum, true,
+ lastAppliedTxId + 1);
+ if (numLoaded != numTxns) {
+ throw new IOException("Batch of txns starting at txnid " +
+ firstTxId + " was supposed to contain " + numTxns +
+ " transactions but only was able to apply " + numLoaded);
+ }
+ lastAppliedTxId += numTxns;
+
+ getFSNamesystem().dir.updateCountForINodeWithQuota(); // inefficient!
} finally {
backupInputStream.clear();
}
}
/**
- * Merge Journal Spool to memory.
- * Journal Spool reader reads journal records from edits.new.
- * When it reaches the end of the file it sets {@link JSpoolState} to WAIT.
- * This blocks journaling (see {@link #journal(int,byte[])}.
- * The reader
- *
- * - reads remaining journal records if any,
- * - renames edits.new to edits,
- * - sets {@link JSpoolState} to OFF,
- * - and notifies the journaling thread.
- *
- * Journaling resumes with applying new journal records to the memory state,
- * and writing them into edits file(s).
+ * Transition the BackupNode from JOURNAL_ONLY state to IN_SYNC state.
+ * This is done by repeated invocations of tryConvergeJournalSpool until
+ * we are caught up to the latest in-progress edits file.
*/
void convergeJournalSpool() throws IOException {
-    Iterator<StorageDirectory> itEdits
-        = storage.dirIterator(NameNodeDirType.EDITS);
- if(! itEdits.hasNext())
- throw new IOException("Could not locate checkpoint directories");
- StorageDirectory sdEdits = itEdits.next();
- int numEdits = 0;
- File jSpoolFile = getJSpoolFile(sdEdits);
- long startTime = now();
- if(jSpoolFile.exists()) {
- // load edits.new
- EditLogFileInputStream edits = new EditLogFileInputStream(jSpoolFile);
- BufferedInputStream bin = new BufferedInputStream(edits);
- DataInputStream in = new DataInputStream(bin);
- FSEditLogLoader logLoader = new FSEditLogLoader(namesystem);
- int logVersion = logLoader.readLogVersion(in);
- Checksum checksum = null;
- if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) {
- checksum = FSEditLog.getChecksum();
- in = new DataInputStream(new CheckedInputStream(bin, checksum));
- }
- numEdits += logLoader.loadEditRecords(logVersion, in, checksum, false);
+ Preconditions.checkState(bnState == BNState.JOURNAL_ONLY,
+ "bad state: %s", bnState);
- // first time reached the end of spool
- jsState = JSpoolState.WAIT;
- numEdits += logLoader.loadEditRecords(logVersion,
- in, checksum, true);
- getFSNamesystem().dir.updateCountForINodeWithQuota();
- edits.close();
+ while (!tryConvergeJournalSpool()) {
+ ;
}
+ assert bnState == BNState.IN_SYNC;
+ }
+
+ private boolean tryConvergeJournalSpool() throws IOException {
+ Preconditions.checkState(bnState == BNState.JOURNAL_ONLY,
+ "bad state: %s", bnState);
+
+ // This section is unsynchronized so we can continue to apply
+ // ahead of where we're reading, concurrently. Since the state
+ // is JOURNAL_ONLY at this point, we know that lastAppliedTxId
+ // doesn't change, and curSegmentTxId only increases
- FSImage.LOG.info("Edits file " + jSpoolFile.getCanonicalPath()
- + " of size " + jSpoolFile.length() + " edits # " + numEdits
- + " loaded in " + (now()-startTime)/1000 + " seconds.");
+ while (lastAppliedTxId < editLog.getCurSegmentTxId() - 1) {
+ long target = editLog.getCurSegmentTxId();
+ LOG.info("Loading edits into backupnode to try to catch up from txid "
+ + lastAppliedTxId + " to " + target);
+ FSImageTransactionalStorageInspector inspector =
+ new FSImageTransactionalStorageInspector();
+
+ storage.inspectStorageDirs(inspector);
+ LogLoadPlan logLoadPlan = inspector.createLogLoadPlan(lastAppliedTxId,
+ target - 1);
+
+ logLoadPlan.doRecovery();
+ loadEdits(logLoadPlan.getEditsFiles());
+ }
+
+ // now, need to load the in-progress file
+ synchronized (this) {
+ if (lastAppliedTxId != editLog.getCurSegmentTxId() - 1) {
+ LOG.debug("Logs rolled while catching up to current segment");
+ return false; // drop lock and try again to load local logs
+ }
+
+ EditLogInputStream stream = getEditLog().getInProgressFileInputStream();
+ try {
+ long remainingTxns = getEditLog().getLastWrittenTxId() - lastAppliedTxId;
+
+ LOG.info("Going to finish converging with remaining " + remainingTxns
+ + " txns from in-progress stream " + stream);
+
+ FSEditLogLoader loader = new FSEditLogLoader(namesystem);
+ int numLoaded = loader.loadFSEdits(stream, lastAppliedTxId + 1);
+ lastAppliedTxId += numLoaded;
+ assert numLoaded == remainingTxns :
+ "expected to load " + remainingTxns + " but loaded " +
+ numLoaded + " from " + stream;
+ } finally {
+ IOUtils.closeStream(stream);
+ }
- // rename spool edits.new to edits making it in sync with the active node
- // subsequent journal records will go directly to edits
- editLog.revertFileStreams(STORAGE_JSPOOL_DIR + "/" + STORAGE_JSPOOL_FILE);
+ LOG.info("Successfully synced BackupNode with NameNode at txnid " +
+ lastAppliedTxId);
+ setState(BNState.IN_SYNC);
+ }
+ return true;
+ }
- // write version file
- resetVersion(false, storage.getImageDigest());
+ /**
+ * Transition edit log to a new state, logging as necessary.
+ */
+ private synchronized void setState(BNState newState) {
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("State transition " + bnState + " -> " + newState,
+ new Exception("trace"));
+ }
+ bnState = newState;
+ }
- // wake up journal writer
- synchronized(this) {
- jsState = JSpoolState.OFF;
+ /**
+ * Receive a notification that the NameNode has begun a new edit log.
+ * This causes the BN to also start the new edit log in its local
+ * directories.
+ */
+ synchronized void namenodeStartedLogSegment(long txid)
+ throws IOException {
+ LOG.info("NameNode started a new log segment at txid " + txid);
+ if (editLog.isOpen()) {
+ if (editLog.getLastWrittenTxId() == txid - 1) {
+ // We are in sync with the NN, so end and finalize the current segment
+ editLog.endCurrentLogSegment(false);
+ } else {
+ // We appear to have missed some transactions -- the NN probably
+ // lost contact with us temporarily. So, mark the current segment
+ // as aborted.
+ LOG.warn("NN started new log segment at txid " + txid +
+ ", but BN had only written up to txid " +
+ editLog.getLastWrittenTxId() +
+ "in the log segment starting at " +
+ editLog.getCurSegmentTxId() + ". Aborting this " +
+ "log segment.");
+ editLog.abortCurrentLogSegment();
+ }
+ }
+ editLog.setNextTxId(txid);
+ editLog.startLogSegment(txid, false);
+ if (bnState == BNState.DROP_UNTIL_NEXT_ROLL) {
+ setState(BNState.JOURNAL_ONLY);
+ }
+
+ if (stopApplyingEditsOnNextRoll) {
+ if (bnState == BNState.IN_SYNC) {
+ LOG.info("Stopped applying edits to prepare for checkpoint.");
+ setState(BNState.JOURNAL_ONLY);
+ }
+ stopApplyingEditsOnNextRoll = false;
notifyAll();
}
+ }
- // Rename lastcheckpoint.tmp to previous.checkpoint
-    for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- storage.moveLastCheckpoint(sd);
+ /**
+ * Request that the next time the BN receives a log roll, it should
+ * stop applying the edits log to the local namespace. This is
+ * typically followed on by a call to {@link #waitUntilNamespaceFrozen()}
+ */
+ synchronized void freezeNamespaceAtNextRoll() {
+ stopApplyingEditsOnNextRoll = true;
+ }
+
+ /**
+ * After {@link #freezeNamespaceAtNextRoll()} has been called, wait until
+ * the BN receives notification of the next log roll.
+ */
+ synchronized void waitUntilNamespaceFrozen() throws IOException {
+ if (bnState != BNState.IN_SYNC) return;
+
+ LOG.info("Waiting until the NameNode rolls its edit logs in order " +
+ "to freeze the BackupNode namespace.");
+ while (bnState == BNState.IN_SYNC) {
+ Preconditions.checkState(stopApplyingEditsOnNextRoll,
+ "If still in sync, we should still have the flag set to " +
+ "freeze at next roll");
+ try {
+ wait();
+ } catch (InterruptedException ie) {
+ LOG.warn("Interrupted waiting for namespace to freeze", ie);
+ throw new IOException(ie);
+ }
}
+ LOG.info("BackupNode namespace frozen.");
+ }
+
+ /**
+ * Override close() so that we don't finalize edit logs.
+ */
+ @Override
+ public synchronized void close() throws IOException {
+ editLog.abortCurrentLogSegment();
+ storage.close();
}
}
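
To summarize the state machine above, the roll-time transition implemented in namenodeStartedLogSegment() reduces to a few lines (a sketch for reference, not part of the patch):

    // Sketch: the roll-time transition implemented by namenodeStartedLogSegment().
    static BNState nextStateOnRoll(BNState s, boolean stopApplyingEditsOnNextRoll) {
      if (s == BNState.DROP_UNTIL_NEXT_ROLL) {
        return BNState.JOURNAL_ONLY;   // begin journaling at a segment boundary
      }
      if (s == BNState.IN_SYNC && stopApplyingEditsOnNextRoll) {
        return BNState.JOURNAL_ONLY;   // freeze the namespace ahead of a checkpoint
      }
      return s;  // JOURNAL_ONLY -> IN_SYNC happens only via convergeJournalSpool()
    }
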
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java
new file mode 100644
index 00000000000..35c4b7384f7
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupJournalManager.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
+
+/**
+ * A JournalManager implementation that uses RPCs to log transactions
+ * to a BackupNode.
+ */
+class BackupJournalManager implements JournalManager {
+
+ private final NamenodeRegistration nnReg;
+ private final NamenodeRegistration bnReg;
+
+ BackupJournalManager(NamenodeRegistration bnReg,
+ NamenodeRegistration nnReg) {
+ this.bnReg = bnReg;
+ this.nnReg = nnReg;
+ }
+
+ @Override
+ public EditLogOutputStream startLogSegment(long txId) throws IOException {
+ EditLogBackupOutputStream stm = new EditLogBackupOutputStream(bnReg, nnReg);
+ stm.startLogSegment(txId);
+ return stm;
+ }
+
+ @Override
+ public void finalizeLogSegment(long firstTxId, long lastTxId)
+ throws IOException {
+ }
+
+ @Override
+ public void setOutputBufferCapacity(int size) {
+ }
+
+ @Override
+ public void purgeLogsOlderThan(long minTxIdToKeep, StoragePurger purger)
+ throws IOException {
+ }
+
+ public boolean matchesRegistration(NamenodeRegistration bnReg) {
+ return bnReg.getAddress().equals(this.bnReg.getAddress());
+ }
+
+ @Override
+ public EditLogInputStream getInProgressInputStream(long segmentStartsAtTxId) {
+ return null;
+ }
+}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java
index 2e7f48f2843..ad575b9e27a 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java
@@ -22,21 +22,20 @@
import java.net.SocketTimeoutException;
import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.FSConstants;
+import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
+import org.apache.hadoop.hdfs.server.common.Storage;
+import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
-import org.apache.hadoop.hdfs.server.common.Storage;
-import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
-import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature;
-import org.apache.hadoop.hdfs.server.namenode.FSImage.CheckpointStates;
-import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.ipc.RPC;
-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.net.NetUtils;
/**
@@ -53,7 +52,7 @@
*
*/
@InterfaceAudience.Private
-public class BackupNode extends NameNode {
+public class BackupNode extends NameNode implements JournalProtocol {
private static final String BN_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY;
private static final String BN_ADDRESS_DEFAULT = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT;
private static final String BN_HTTP_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY;
@@ -119,10 +118,9 @@ protected void setHttpServerAddress(Configuration conf){
@Override // NameNode
protected void loadNamesystem(Configuration conf) throws IOException {
- BackupImage bnImage = new BackupImage();
+ BackupImage bnImage = new BackupImage(conf);
this.namesystem = new FSNamesystem(conf, bnImage);
- bnImage.recoverCreateRead(FSNamesystem.getNamespaceDirs(conf),
- FSNamesystem.getNamespaceEditsDirs(conf));
+ bnImage.recoverCreateRead();
}
@Override // NameNode
@@ -179,6 +177,17 @@ public void stop() {
super.stop();
}
+
+ @Override
+ public long getProtocolVersion(String protocol, long clientVersion)
+ throws IOException {
+ if (protocol.equals(JournalProtocol.class.getName())) {
+ return JournalProtocol.versionID;
+ } else {
+ return super.getProtocolVersion(protocol, clientVersion);
+ }
+ }
+
/////////////////////////////////////////////////////
// NamenodeProtocol implementation for backup node.
/////////////////////////////////////////////////////
@@ -205,34 +214,36 @@ public NamenodeCommand startCheckpoint(NamenodeRegistration registration)
public void endCheckpoint(NamenodeRegistration registration,
CheckpointSignature sig) throws IOException {
throw new UnsupportedActionException("endCheckpoint");
- }
+ }
- @Override // NamenodeProtocol
+ /////////////////////////////////////////////////////
+ // JournalProtocol implementation for backup node.
+ /////////////////////////////////////////////////////
+
+ @Override
public void journal(NamenodeRegistration nnReg,
- int jAction,
- int length,
- byte[] args) throws IOException {
+ long firstTxId, int numTxns,
+ byte[] records) throws IOException {
verifyRequest(nnReg);
if(!nnRpcAddress.equals(nnReg.getAddress()))
throw new IOException("Journal request from unexpected name-node: "
+ nnReg.getAddress() + " expecting " + nnRpcAddress);
- BackupImage bnImage = (BackupImage)getFSImage();
- switch(jAction) {
- case (int)JA_IS_ALIVE:
- return;
- case (int)JA_JOURNAL:
- bnImage.journal(length, args);
- return;
- case (int)JA_JSPOOL_START:
- bnImage.startJournalSpool(nnReg);
- return;
- case (int)JA_CHECKPOINT_TIME:
- bnImage.setCheckpointTime(length, args);
- setRegistration(); // keep registration up to date
- return;
- default:
- throw new IOException("Unexpected journal action: " + jAction);
- }
+ getBNImage().journal(firstTxId, numTxns, records);
+ }
+
+ @Override
+ public void startLogSegment(NamenodeRegistration registration, long txid)
+ throws IOException {
+ verifyRequest(registration);
+
+ getBNImage().namenodeStartedLogSegment(txid);
+ }
+
+ //////////////////////////////////////////////////////
+
+
+ BackupImage getBNImage() {
+ return (BackupImage)getFSImage();
}
boolean shouldCheckpointAtStartup() {
@@ -241,9 +252,9 @@ boolean shouldCheckpointAtStartup() {
assert fsImage.getStorage().getNumStorageDirs() > 0;
return ! fsImage.getStorage().getStorageDir(0).getVersionFile().exists();
}
- if(namesystem == null || namesystem.dir == null || getFSImage() == null)
- return true;
- return fsImage.getEditLog().getNumEditStreams() == 0;
+
+ // BN always checkpoints on startup in order to get in sync with namespace
+ return true;
}
private NamespaceInfo handshake(Configuration conf) throws IOException {
@@ -287,14 +298,6 @@ void doCheckpoint() throws IOException {
checkpointManager.doCheckpoint();
}
- CheckpointStates getCheckpointState() {
- return getFSImage().getCheckpointState();
- }
-
- void setCheckpointState(CheckpointStates cs) {
- getFSImage().setCheckpointState(cs);
- }
-
/**
* Register this backup node with the active name-node.
* @param nsInfo
@@ -302,14 +305,15 @@ void setCheckpointState(CheckpointStates cs) {
*/
private void registerWith(NamespaceInfo nsInfo) throws IOException {
BackupImage bnImage = (BackupImage)getFSImage();
+ NNStorage storage = bnImage.getStorage();
// verify namespaceID
- if(bnImage.getStorage().getNamespaceID() == 0) // new backup storage
- bnImage.getStorage().setStorageInfo(nsInfo);
- else if(bnImage.getStorage().getNamespaceID() != nsInfo.getNamespaceID())
- throw new IOException("Incompatible namespaceIDs"
- + ": active node namespaceID = " + nsInfo.getNamespaceID()
- + "; backup node namespaceID = " + bnImage.getStorage().getNamespaceID());
-
+ if (storage.getNamespaceID() == 0) { // new backup storage
+ storage.setStorageInfo(nsInfo);
+ storage.setBlockPoolID(nsInfo.getBlockPoolID());
+ storage.setClusterID(nsInfo.getClusterID());
+ } else {
+ nsInfo.validateStorage(storage);
+ }
setRegistration();
NamenodeRegistration nnReg = null;
while(!isStopRequested()) {
@@ -338,23 +342,6 @@ else if(!nnReg.isRole(NamenodeRole.NAMENODE)) {
nnRpcAddress = nnReg.getAddress();
}
- /**
- * Reset node namespace state in memory and in storage directories.
- * @throws IOException
- */
- void resetNamespace() throws IOException {
- ((BackupImage)getFSImage()).reset();
- }
-
- /**
- * Get size of the local journal (edit log).
- * @return size of the current journal
- * @throws IOException
- */
- long journalSize() throws IOException {
- return namesystem.getEditLogSize();
- }
-
// TODO: move to a common with DataNode util class
private static NamespaceInfo handshake(NamenodeProtocol namenode)
throws IOException, SocketTimeoutException {
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java
index 7a9d970f92a..0db5cb11138 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/CheckpointSignature.java
@@ -24,10 +24,11 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.namenode.FSImage;
-import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableUtils;
+import com.google.common.collect.ComparisonChain;
+
/**
* A unique signature intended to identify checkpoint transactions.
*/
@@ -35,41 +36,35 @@
public class CheckpointSignature extends StorageInfo
implements WritableComparable<CheckpointSignature> {
private static final String FIELD_SEPARATOR = ":";
- long editsTime = -1L;
- long checkpointTime = -1L;
- MD5Hash imageDigest = null;
+ private static final int NUM_FIELDS = 7;
+
String blockpoolID = "";
+
+ long mostRecentCheckpointTxId;
+ long curSegmentTxId;
public CheckpointSignature() {}
CheckpointSignature(FSImage fsImage) {
super(fsImage.getStorage());
blockpoolID = fsImage.getBlockPoolID();
- editsTime = fsImage.getEditLog().getFsEditTime();
- checkpointTime = fsImage.getStorage().getCheckpointTime();
- imageDigest = fsImage.getStorage().getImageDigest();
- checkpointTime = fsImage.getStorage().getCheckpointTime();
+
+ mostRecentCheckpointTxId = fsImage.getStorage().getMostRecentCheckpointTxId();
+ curSegmentTxId = fsImage.getEditLog().getCurSegmentTxId();
}
CheckpointSignature(String str) {
String[] fields = str.split(FIELD_SEPARATOR);
- assert fields.length == 8 : "Must be 8 fields in CheckpointSignature";
- layoutVersion = Integer.valueOf(fields[0]);
- namespaceID = Integer.valueOf(fields[1]);
- cTime = Long.valueOf(fields[2]);
- editsTime = Long.valueOf(fields[3]);
- checkpointTime = Long.valueOf(fields[4]);
- imageDigest = new MD5Hash(fields[5]);
- clusterID = fields[6];
- blockpoolID = fields[7];
- }
-
- /**
- * Get the MD5 image digest
- * @return the MD5 image digest
- */
- MD5Hash getImageDigest() {
- return imageDigest;
+ assert fields.length == NUM_FIELDS :
+ "Must be " + NUM_FIELDS + " fields in CheckpointSignature";
+ int i = 0;
+ layoutVersion = Integer.valueOf(fields[i++]);
+ namespaceID = Integer.valueOf(fields[i++]);
+ cTime = Long.valueOf(fields[i++]);
+ mostRecentCheckpointTxId = Long.valueOf(fields[i++]);
+ curSegmentTxId = Long.valueOf(fields[i++]);
+ clusterID = fields[i++];
+ blockpoolID = fields[i++];
}
/**
@@ -101,33 +96,26 @@ public String toString() {
return String.valueOf(layoutVersion) + FIELD_SEPARATOR
+ String.valueOf(namespaceID) + FIELD_SEPARATOR
+ String.valueOf(cTime) + FIELD_SEPARATOR
- + String.valueOf(editsTime) + FIELD_SEPARATOR
- + String.valueOf(checkpointTime) + FIELD_SEPARATOR
- + imageDigest.toString() + FIELD_SEPARATOR
+ + String.valueOf(mostRecentCheckpointTxId) + FIELD_SEPARATOR
+ + String.valueOf(curSegmentTxId) + FIELD_SEPARATOR
+ clusterID + FIELD_SEPARATOR
+ blockpoolID ;
}
void validateStorageInfo(FSImage si) throws IOException {
- if(layoutVersion != si.getLayoutVersion()
- || namespaceID != si.getNamespaceID()
- || cTime != si.getStorage().cTime
- || checkpointTime != si.getStorage().getCheckpointTime()
- || !imageDigest.equals(si.getStorage().imageDigest)
- || !clusterID.equals(si.getClusterID())
- || !blockpoolID.equals(si.getBlockPoolID())) {
- // checkpointTime can change when the image is saved - do not compare
+ if(layoutVersion != si.getStorage().layoutVersion
+ || namespaceID != si.getStorage().namespaceID
+ || cTime != si.getStorage().cTime
+ || !clusterID.equals(si.getClusterID())
+ || !blockpoolID.equals(si.getBlockPoolID())) {
throw new IOException("Inconsistent checkpoint fields.\n"
+ "LV = " + layoutVersion + " namespaceID = " + namespaceID
- + " cTime = " + cTime + "; checkpointTime = " + checkpointTime
- + " ; imageDigest = " + imageDigest
+ + " cTime = " + cTime
+ " ; clusterId = " + clusterID
+ " ; blockpoolId = " + blockpoolID
+ ".\nExpecting respectively: "
- + si.getLayoutVersion() + "; "
- + si.getNamespaceID() + "; " + si.getStorage().cTime
- + "; " + si.getStorage().getCheckpointTime() + "; "
- + si.getStorage().imageDigest
+ + si.getStorage().layoutVersion + "; "
+ + si.getStorage().namespaceID + "; " + si.getStorage().cTime
+ "; " + si.getClusterID() + "; "
+ si.getBlockPoolID() + ".");
}
@@ -137,19 +125,15 @@ void validateStorageInfo(FSImage si) throws IOException {
// Comparable interface
//
public int compareTo(CheckpointSignature o) {
- return
- (layoutVersion < o.layoutVersion) ? -1 :
- (layoutVersion > o.layoutVersion) ? 1 :
- (namespaceID < o.namespaceID) ? -1 : (namespaceID > o.namespaceID) ? 1 :
- (cTime < o.cTime) ? -1 : (cTime > o.cTime) ? 1 :
- (editsTime < o.editsTime) ? -1 : (editsTime > o.editsTime) ? 1 :
- (checkpointTime < o.checkpointTime) ? -1 :
- (checkpointTime > o.checkpointTime) ? 1 :
- (clusterID.compareTo(o.clusterID) < 0) ? -1 :
- (clusterID.compareTo(o.clusterID) > 0) ? 1 :
- (blockpoolID.compareTo(o.blockpoolID) < 0) ? -1 :
- (blockpoolID.compareTo(o.blockpoolID) > 0) ? 1 :
- imageDigest.compareTo(o.imageDigest);
+ return ComparisonChain.start()
+ .compare(layoutVersion, o.layoutVersion)
+ .compare(namespaceID, o.namespaceID)
+ .compare(cTime, o.cTime)
+ .compare(mostRecentCheckpointTxId, o.mostRecentCheckpointTxId)
+ .compare(curSegmentTxId, o.curSegmentTxId)
+ .compare(clusterID, o.clusterID)
+ .compare(blockpoolID, o.blockpoolID)
+ .result();
}
public boolean equals(Object o) {
@@ -161,9 +145,8 @@ public boolean equals(Object o) {
public int hashCode() {
return layoutVersion ^ namespaceID ^
- (int)(cTime ^ editsTime ^ checkpointTime) ^
- imageDigest.hashCode() ^ clusterID.hashCode()
- ^ blockpoolID.hashCode();
+ (int)(cTime ^ mostRecentCheckpointTxId ^ curSegmentTxId)
+ ^ clusterID.hashCode() ^ blockpoolID.hashCode();
}
/////////////////////////////////////////////////
@@ -172,17 +155,14 @@ public int hashCode() {
public void write(DataOutput out) throws IOException {
super.write(out);
WritableUtils.writeString(out, blockpoolID);
- out.writeLong(editsTime);
- out.writeLong(checkpointTime);
- imageDigest.write(out);
+ out.writeLong(mostRecentCheckpointTxId);
+ out.writeLong(curSegmentTxId);
}
public void readFields(DataInput in) throws IOException {
super.readFields(in);
blockpoolID = WritableUtils.readString(in);
- editsTime = in.readLong();
- checkpointTime = in.readLong();
- imageDigest = new MD5Hash();
- imageDigest.readFields(in);
+ mostRecentCheckpointTxId = in.readLong();
+ curSegmentTxId = in.readLong();
}
}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
index ee5e87c9aa8..49f713015c4 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
@@ -17,29 +17,30 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
-import java.io.IOException;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY;
+import static org.apache.hadoop.hdfs.server.common.Util.now;
+
import java.io.File;
+import java.io.IOException;
import java.net.InetSocketAddress;
-import java.util.Collection;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
-
-import org.apache.hadoop.hdfs.protocol.FSConstants;
-import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
-import static org.apache.hadoop.hdfs.server.common.Util.now;
-import org.apache.hadoop.hdfs.server.namenode.FSImage.CheckpointStates;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
-import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
-import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
+import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
+import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.http.HttpServer;
+import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.util.Daemon;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_BACKUP_HTTP_ADDRESS_DEFAULT;
+
+import com.google.common.collect.Lists;
/**
* The Checkpointer is responsible for supporting periodic checkpoints
@@ -59,7 +60,7 @@ class Checkpointer extends Daemon {
private BackupNode backupNode;
volatile boolean shouldRun;
private long checkpointPeriod; // in seconds
- private long checkpointSize; // size (in MB) of current Edit Log
+  private long checkpointTxnCount; // uncheckpointed txns that trigger a checkpoint
private String infoBindAddress;
@@ -79,6 +80,7 @@ private NamenodeProtocol getNamenode(){
try {
initialize(conf);
} catch(IOException e) {
+ LOG.warn("Checkpointer got exception", e);
shutdown();
throw e;
}
@@ -87,6 +89,7 @@ private NamenodeProtocol getNamenode(){
/**
* Initialize checkpoint.
*/
+ @SuppressWarnings("deprecation")
private void initialize(Configuration conf) throws IOException {
// Create connection to the namenode.
shouldRun = true;
@@ -94,8 +97,9 @@ private void initialize(Configuration conf) throws IOException {
// Initialize other scheduling parameters from the configuration
checkpointPeriod = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
- checkpointSize = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_SIZE_KEY,
- DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_SIZE_DEFAULT);
+ checkpointTxnCount = conf.getLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
+ SecondaryNameNode.warnForDeprecatedConfigs(conf);
// Pull out exact http address for posting url to avoid ip aliasing issues
String fullInfoAddr = conf.get(DFS_NAMENODE_BACKUP_HTTP_ADDRESS_KEY,
@@ -104,8 +108,7 @@ private void initialize(Configuration conf) throws IOException {
LOG.info("Checkpoint Period : " + checkpointPeriod + " secs " +
"(" + checkpointPeriod/60 + " min)");
- LOG.info("Log Size Trigger : " + checkpointSize + " bytes " +
- "(" + checkpointSize/1024 + " KB)");
+ LOG.info("Log Size Trigger : " + checkpointTxnCount + " txns ");
}
/**
@@ -128,8 +131,9 @@ public void run() {
periodMSec *= 1000;
long lastCheckpointTime = 0;
- if(!backupNode.shouldCheckpointAtStartup())
+ if (!backupNode.shouldCheckpointAtStartup()) {
lastCheckpointTime = now();
+ }
while(shouldRun) {
try {
long now = now();
@@ -137,8 +141,8 @@ public void run() {
if(now >= lastCheckpointTime + periodMSec) {
shouldCheckpoint = true;
} else {
- long size = getJournalSize();
- if(size >= checkpointSize)
+ long txns = countUncheckpointedTxns();
+ if(txns >= checkpointTxnCount)
shouldCheckpoint = true;
}
if(shouldCheckpoint) {
@@ -160,64 +164,24 @@ public void run() {
}
}
- private long getJournalSize() throws IOException {
- // If BACKUP node has been loaded
- // get edits size from the local file. ACTIVE has the same.
- if(backupNode.isRole(NamenodeRole.BACKUP)
- && getFSImage().getEditLog().isOpen())
- return backupNode.journalSize();
- // Go to the ACTIVE node for its size
- return getNamenode().journalSize(backupNode.getRegistration());
- }
-
- /**
- * Download fsimage
and edits
- * files from the remote name-node.
- */
- private void downloadCheckpoint(CheckpointSignature sig) throws IOException {
- // Retrieve image file
- String fileid = "getimage=1";
-    Collection<File> list = getFSImage()
-      .getStorage().getFiles(NameNodeFile.IMAGE, NameNodeDirType.IMAGE);
- File[] files = list.toArray(new File[list.size()]);
- assert files.length > 0 : "No checkpoint targets.";
- String nnHttpAddr = backupNode.nnHttpAddress;
- TransferFsImage.getFileClient(nnHttpAddr, fileid, files, false);
- LOG.info("Downloaded file " + files[0].getName() + " size " +
- files[0].length() + " bytes.");
-
- // Retrieve edits file
- fileid = "getedit=1";
- list = getFSImage()
- .getStorage().getFiles(NameNodeFile.EDITS, NameNodeDirType.EDITS);
- files = list.toArray(new File[list.size()]);
- assert files.length > 0 : "No checkpoint targets.";
- TransferFsImage.getFileClient(nnHttpAddr, fileid, files, false);
- LOG.info("Downloaded file " + files[0].getName() + " size " +
- files[0].length() + " bytes.");
- }
-
- /**
- * Copy the new image into remote name-node.
- */
- private void uploadCheckpoint(CheckpointSignature sig) throws IOException {
- // Use the exact http addr as specified in config to deal with ip aliasing
- InetSocketAddress httpSocAddr = backupNode.getHttpAddress();
- int httpPort = httpSocAddr.getPort();
- String fileid = "putimage=1&port=" + httpPort +
- "&machine=" + infoBindAddress +
- "&token=" + sig.toString() +
- "&newChecksum=" + getFSImage().getStorage().getImageDigest().toString();
- LOG.info("Posted URL " + backupNode.nnHttpAddress + fileid);
- TransferFsImage.getFileClient(backupNode.nnHttpAddress,
- fileid, (File[])null, false);
+ private long countUncheckpointedTxns() throws IOException {
+ long curTxId = getNamenode().getTransactionID();
+ long uncheckpointedTxns = curTxId -
+ getFSImage().getStorage().getMostRecentCheckpointTxId();
+ assert uncheckpointedTxns >= 0;
+ return uncheckpointedTxns;
}
/**
* Create a new checkpoint
*/
void doCheckpoint() throws IOException {
+ BackupImage bnImage = getFSImage();
+ NNStorage bnStorage = bnImage.getStorage();
+
long startTime = now();
+ bnImage.freezeNamespaceAtNextRoll();
+
NamenodeCommand cmd =
getNamenode().startCheckpoint(backupNode.getRegistration());
CheckpointCommand cpCmd = null;
@@ -233,37 +197,94 @@ void doCheckpoint() throws IOException {
throw new IOException("Unsupported NamenodeCommand: "+cmd.getAction());
}
+ bnImage.waitUntilNamespaceFrozen();
+
CheckpointSignature sig = cpCmd.getSignature();
- assert FSConstants.LAYOUT_VERSION == sig.getLayoutVersion() :
- "Signature should have current layout version. Expected: "
- + FSConstants.LAYOUT_VERSION + " actual "+ sig.getLayoutVersion();
- assert !backupNode.isRole(NamenodeRole.CHECKPOINT) ||
- cpCmd.isImageObsolete() : "checkpoint node should always download image.";
- backupNode.setCheckpointState(CheckpointStates.UPLOAD_START);
- if(cpCmd.isImageObsolete()) {
- // First reset storage on disk and memory state
- backupNode.resetNamespace();
- downloadCheckpoint(sig);
+
+ // Make sure we're talking to the same NN!
+ sig.validateStorageInfo(bnImage);
+
+ long lastApplied = bnImage.getLastAppliedTxId();
+ LOG.debug("Doing checkpoint. Last applied: " + lastApplied);
+ RemoteEditLogManifest manifest =
+ getNamenode().getEditLogManifest(bnImage.getLastAppliedTxId());
+
+ if (!manifest.getLogs().isEmpty()) {
+ RemoteEditLog firstRemoteLog = manifest.getLogs().get(0);
+ // If we don't have enough edit logs to roll forward from the last
+ // applied txid, we need to download and load a newer image.
+ if (firstRemoteLog.getStartTxId() > lastApplied + 1) {
+ LOG.info("Unable to roll forward using only logs. Downloading " +
+ "image with txid " + sig.mostRecentCheckpointTxId);
+ MD5Hash downloadedHash = TransferFsImage.downloadImageToStorage(
+ backupNode.nnHttpAddress, sig.mostRecentCheckpointTxId,
+ bnStorage, true);
+ bnImage.saveDigestAndRenameCheckpointImage(
+ sig.mostRecentCheckpointTxId, downloadedHash);
+
+ LOG.info("Loading image with txid " + sig.mostRecentCheckpointTxId);
+ File file = bnStorage.findImageFile(sig.mostRecentCheckpointTxId);
+ bnImage.reloadFromImageFile(file);
+ }
+
+ lastApplied = bnImage.getLastAppliedTxId();
+ if (firstRemoteLog.getStartTxId() > lastApplied + 1) {
+ throw new IOException("No logs to roll forward from " + lastApplied);
+ }
+
+ // get edits files
+ for (RemoteEditLog log : manifest.getLogs()) {
+ TransferFsImage.downloadEditsToStorage(
+ backupNode.nnHttpAddress, log, bnStorage);
+ }
+
+ rollForwardByApplyingLogs(manifest, bnImage);
}
- BackupImage bnImage = getFSImage();
- bnImage.getStorage().setBlockPoolID(backupNode.getBlockPoolId());
- bnImage.getStorage().setClusterID(backupNode.getClusterId());
- bnImage.loadCheckpoint(sig);
- sig.validateStorageInfo(bnImage);
- bnImage.saveCheckpoint();
+ long txid = bnImage.getLastAppliedTxId();
+ bnImage.saveFSImageInAllDirs(txid);
+ bnStorage.writeAll();
- if(cpCmd.needToReturnImage())
- uploadCheckpoint(sig);
+ if(cpCmd.needToReturnImage()) {
+ TransferFsImage.uploadImageFromStorage(
+ backupNode.nnHttpAddress, getImageListenAddress(),
+ bnStorage, txid);
+ }
getNamenode().endCheckpoint(backupNode.getRegistration(), sig);
- bnImage.convergeJournalSpool();
+ if (backupNode.getRole() == NamenodeRole.BACKUP) {
+ bnImage.convergeJournalSpool();
+ }
backupNode.setRegistration(); // keep registration up to date
- if(backupNode.isRole(NamenodeRole.CHECKPOINT))
- getFSImage().getEditLog().close();
+
+ long imageSize = bnImage.getStorage().getFsImageName(txid).length();
LOG.info("Checkpoint completed in "
+ (now() - startTime)/1000 + " seconds."
- + " New Image Size: " + bnImage.getStorage().getFsImageName().length());
+ + " New Image Size: " + imageSize);
+ }
+
+ private InetSocketAddress getImageListenAddress() {
+ InetSocketAddress httpSocAddr = backupNode.getHttpAddress();
+ int httpPort = httpSocAddr.getPort();
+ return new InetSocketAddress(infoBindAddress, httpPort);
+ }
+
+ static void rollForwardByApplyingLogs(
+ RemoteEditLogManifest manifest,
+ FSImage dstImage) throws IOException {
+ NNStorage dstStorage = dstImage.getStorage();
+
+ List<File> editsFiles = Lists.newArrayList();
+ for (RemoteEditLog log : manifest.getLogs()) {
+ File f = dstStorage.findFinalizedEditsFile(
+ log.getStartTxId(), log.getEndTxId());
+ if (log.getStartTxId() > dstImage.getLastAppliedTxId()) {
+ editsFiles.add(f);
+ }
+ }
+ LOG.info("Checkpointer about to load edits from " +
+ editsFiles.size() + " file(s).");
+ dstImage.loadEdits(editsFiles);
}
}
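
The run() loop above now triggers a checkpoint on either of two conditions: the checkpoint period expiring, or the uncheckpointed-transaction count crossing the configured threshold. Reduced to a predicate, the decision looks like the sketch below (the helper method itself is illustrative, not part of the patch; the field names mirror those used above):

    // Sketch of the Checkpointer trigger: checkpoint when the configured
    // period has elapsed, or when enough transactions have accumulated.
    // (Illustrative helper; not part of this patch.)
    boolean shouldCheckpoint(long now, long lastCheckpointTime, long periodMSec,
                             long uncheckpointedTxns, long checkpointTxnCount) {
      if (now >= lastCheckpointTime + periodMSec) {
        return true;  // dfs.namenode.checkpoint.period expired
      }
      return uncheckpointedTxns >= checkpointTxnCount;  // dfs.namenode.checkpoint.txns
    }
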
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java
index 81992586e2d..007578f28a6 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupInputStream.java
@@ -42,10 +42,6 @@ private static class ByteBufferInputStream extends ByteArrayInputStream {
super(new byte[0]);
}
- byte[] getData() {
- return super.buf;
- }
-
void setData(byte[] newBytes) {
super.buf = newBytes;
super.count = newBytes == null ? 0 : newBytes.length;
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java
index 5e9d5398ed6..5dd82393fd0 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogBackupOutputStream.java
@@ -19,10 +19,11 @@
import java.io.IOException;
import java.net.InetSocketAddress;
+import java.util.Arrays;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.server.common.Storage;
-import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.ipc.RPC;
@@ -39,7 +40,7 @@
class EditLogBackupOutputStream extends EditLogOutputStream {
static int DEFAULT_BUFFER_SIZE = 256;
- private NamenodeProtocol backupNode; // RPC proxy to backup node
+ private JournalProtocol backupNode; // RPC proxy to backup node
private NamenodeRegistration bnRegistration; // backup node registration
private NamenodeRegistration nnRegistration; // active node registration
private EditsDoubleBuffer doubleBuf;
@@ -56,8 +57,8 @@ class EditLogBackupOutputStream extends EditLogOutputStream {
Storage.LOG.info("EditLogBackupOutputStream connects to: " + bnAddress);
try {
this.backupNode =
- (NamenodeProtocol) RPC.getProxy(NamenodeProtocol.class,
- NamenodeProtocol.versionID, bnAddress, new HdfsConfiguration());
+ RPC.getProxy(JournalProtocol.class,
+ JournalProtocol.versionID, bnAddress, new HdfsConfiguration());
} catch(IOException e) {
Storage.LOG.error("Error connecting to: " + bnAddress, e);
throw e;
@@ -65,7 +66,7 @@ class EditLogBackupOutputStream extends EditLogOutputStream {
this.doubleBuf = new EditsDoubleBuffer(DEFAULT_BUFFER_SIZE);
this.out = new DataOutputBuffer(DEFAULT_BUFFER_SIZE);
}
-
+
@Override // JournalStream
public String getName() {
return bnRegistration.getAddress();
@@ -109,6 +110,12 @@ public void close() throws IOException {
doubleBuf = null;
}
+ @Override
+ public void abort() throws IOException {
+ RPC.stopProxy(backupNode);
+ doubleBuf = null;
+ }
+
@Override // EditLogOutputStream
void setReadyToFlush() throws IOException {
doubleBuf.setReadyToFlush();
@@ -116,11 +123,21 @@ void setReadyToFlush() throws IOException {
@Override // EditLogOutputStream
protected void flushAndSync() throws IOException {
- // XXX: this code won't work in trunk, but it's redone
- // in HDFS-1073 where it's simpler.
+ assert out.getLength() == 0 : "Output buffer is not empty";
+
+ int numReadyTxns = doubleBuf.countReadyTxns();
+ long firstTxToFlush = doubleBuf.getFirstReadyTxId();
+
doubleBuf.flushTo(out);
- if (out.size() > 0) {
- send(NamenodeProtocol.JA_JOURNAL);
+ if (out.getLength() > 0) {
+ assert numReadyTxns > 0;
+
+ byte[] data = Arrays.copyOf(out.getData(), out.getLength());
+ out.reset();
+ assert out.getLength() == 0 : "Output buffer is not empty";
+
+ backupNode.journal(nnRegistration,
+ firstTxToFlush, numReadyTxns, data);
}
}
@@ -134,16 +151,6 @@ long length() throws IOException {
return 0;
}
- private void send(int ja) throws IOException {
- try {
- int length = out.getLength();
- out.write(FSEditLogOpCodes.OP_INVALID.getOpCode());
- backupNode.journal(nnRegistration, ja, length, out.getData());
- } finally {
- out.reset();
- }
- }
-
/**
* Get backup node registration.
*/
@@ -151,17 +158,7 @@ NamenodeRegistration getRegistration() {
return bnRegistration;
}
- /**
- * Verify that the backup node is alive.
- */
- boolean isAlive() {
- try {
- send(NamenodeProtocol.JA_IS_ALIVE);
- } catch(IOException ei) {
- Storage.LOG.info(bnRegistration.getRole() + " "
- + bnRegistration.getAddress() + " is not alive. ", ei);
- return false;
- }
- return true;
+ void startLogSegment(long txId) throws IOException {
+ backupNode.startLogSegment(nnRegistration, txId);
}
}
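
For reference, the slice of JournalProtocol that this stream now depends on can be reconstructed from its two call sites above (the journal(...) call in flushAndSync and startLogSegment(...)). This is a reconstruction only; the authoritative declaration lives in JournalProtocol.java, which this hunk does not show:

    // Approximate slice of JournalProtocol as used by EditLogBackupOutputStream,
    // reconstructed from the call sites above. See JournalProtocol.java for
    // the authoritative signatures.
    interface JournalProtocolSlice {
      // Ship a contiguous run of serialized transactions starting at firstTxnId.
      void journal(NamenodeRegistration registration,
                   long firstTxnId, int numTxns, byte[] records) throws IOException;
      // Announce that a new log segment begins at txId.
      void startLogSegment(NamenodeRegistration registration, long txId)
          throws IOException;
    }
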
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java
index b27b8cfbc4e..2ae0bd038f9 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileInputStream.java
@@ -70,4 +70,10 @@ long length() throws IOException {
// file size + size of both buffers
return file.length();
}
+
+ @Override
+ public String toString() {
+ return getName();
+ }
+
}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java
index 44130015ad9..f79f44266e0 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogFileOutputStream.java
@@ -25,6 +25,8 @@
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.io.IOUtils;
@@ -35,6 +37,8 @@
* stores edits in a local file.
*/
class EditLogFileOutputStream extends EditLogOutputStream {
+ private static Log LOG = LogFactory.getLog(EditLogFileOutputStream.class);
+
private static int EDITS_FILE_HEADER_SIZE_BYTES = Integer.SIZE / Byte.SIZE;
private File file;
@@ -85,7 +89,14 @@ void write(FSEditLogOp op) throws IOException {
doubleBuf.writeOp(op);
}
- /** {@inheritDoc} */
+ /**
+ * Write a transaction to the stream. The serialization format is:
+ *
+ * - the opcode (byte)
+ * - the transaction id (long)
+ * - the actual Writables for the transaction
+ *
+ */
@Override
void writeRaw(byte[] bytes, int offset, int length) throws IOException {
doubleBuf.writeRaw(bytes, offset, length);
@@ -105,6 +116,10 @@ void create() throws IOException {
@Override
public void close() throws IOException {
+ if (fp == null) {
+ throw new IOException("Trying to use aborted output stream");
+ }
+
try {
// close should have been called after all pending transactions
// have been flushed & synced.
@@ -130,6 +145,16 @@ public void close() throws IOException {
fc = null;
fp = null;
}
+ fp = null;
+ }
+
+ @Override
+ public void abort() throws IOException {
+ if (fp == null) {
+ return;
+ }
+ IOUtils.cleanup(LOG, fp);
+ fp = null;
}
/**
@@ -148,6 +173,10 @@ void setReadyToFlush() throws IOException {
*/
@Override
protected void flushAndSync() throws IOException {
+ if (fp == null) {
+ throw new IOException("Trying to use aborted output stream");
+ }
+
preallocate(); // preallocate file if necessary
doubleBuf.flushTo(fp);
fc.force(false); // metadata updates not needed because of preallocation
@@ -189,21 +218,19 @@ private void preallocate() throws IOException {
}
}
- /**
- * Operations like OP_JSPOOL_START and OP_CHECKPOINT_TIME should not be
- * written into edits file.
- */
- @Override
- boolean isOperationSupported(byte op) {
- return op < FSEditLogOpCodes.OP_JSPOOL_START.getOpCode() - 1;
- }
-
/**
* Returns the file associated with this stream.
*/
File getFile() {
return file;
}
+
+ /**
+ * @return true if this stream is currently open.
+ */
+ public boolean isOpen() {
+ return fp != null;
+ }
@VisibleForTesting
public void setFileChannelForTesting(FileChannel fc) {
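
The stream now treats fp == null as its "aborted" marker, and both close() and flushAndSync() repeat the same guard before touching the file. One way to read that pattern is as a single check (a refactoring sketch for readability only; the patch inlines the guard at each call site):

    // The repeated fp == null guard above, pulled into one place.
    // (Refactoring sketch; not part of this patch.)
    private void checkNotAborted() throws IOException {
      if (fp == null) {
        throw new IOException("Trying to use aborted output stream");
      }
    }
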
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java
index eb16d692928..8577db8e457 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditLogOutputStream.java
@@ -18,9 +18,13 @@
package org.apache.hadoop.hdfs.server.namenode;
import java.io.IOException;
+import java.util.zip.Checksum;
import static org.apache.hadoop.hdfs.server.common.Util.now;
+import org.apache.hadoop.io.DataOutputBuffer;
+import org.apache.hadoop.io.Writable;
+
/**
* A generic abstract class to support journaling of edits logs into
* a persistent storage.
@@ -63,9 +67,19 @@ abstract void writeRaw(byte[] bytes, int offset, int length)
*/
abstract void create() throws IOException;
- /** {@inheritDoc} */
+ /**
+ * Close the journal.
+ * @throws IOException if the journal can't be closed,
+ * or if there are unflushed edits
+ */
abstract public void close() throws IOException;
+ /**
+ * Close the stream without necessarily flushing any pending data.
+ * This may be called after a previous write or close threw an exception.
+ */
+ abstract public void abort() throws IOException;
+
/**
* All data that has been written to the stream so far will be flushed.
* New data can be still written to the stream while flushing is performed.
@@ -108,10 +122,6 @@ public boolean shouldForceSync() {
return false;
}
- boolean isOperationSupported(byte op) {
- return true;
- }
-
/**
* Return total time spent in {@link #flushAndSync()}
*/
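
The close()/abort() split gives callers a uniform failure idiom: attempt a clean close, and fall back to abort so a stream with buffered-but-unflushed edits is never reused. A hypothetical caller (not part of this patch):

    // Hypothetical caller of the new contract: close cleanly if possible,
    // otherwise abort so later writes fail fast.
    void closeOrAbort(EditLogOutputStream stream) {
      try {
        stream.close();      // flushes and syncs pending edits
      } catch (IOException e) {
        try {
          stream.abort();    // drops pending data, releases resources
        } catch (IOException ignored) {
          // abort is best-effort; the stream is unusable either way
        }
      }
    }
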
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java
index 60cb9e6dba7..0dd90588f44 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/EditsDoubleBuffer.java
@@ -20,6 +20,7 @@
import java.io.IOException;
import java.io.OutputStream;
+import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.Writer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IOUtils;
@@ -35,20 +36,19 @@
*/
class EditsDoubleBuffer {
- private DataOutputBuffer bufCurrent; // current buffer for writing
- private DataOutputBuffer bufReady; // buffer ready for flushing
+ private TxnBuffer bufCurrent; // current buffer for writing
+ private TxnBuffer bufReady; // buffer ready for flushing
private final int initBufferSize;
- private Writer writer;
public EditsDoubleBuffer(int defaultBufferSize) {
initBufferSize = defaultBufferSize;
- bufCurrent = new DataOutputBuffer(initBufferSize);
- bufReady = new DataOutputBuffer(initBufferSize);
- writer = new FSEditLogOp.Writer(bufCurrent);
+ bufCurrent = new TxnBuffer(initBufferSize);
+ bufReady = new TxnBuffer(initBufferSize);
+
}
public void writeOp(FSEditLogOp op) throws IOException {
- writer.writeOp(op);
+ bufCurrent.writeOp(op);
}
void writeRaw(byte[] bytes, int offset, int length) throws IOException {
@@ -71,10 +71,9 @@ void close() throws IOException {
void setReadyToFlush() {
assert isFlushed() : "previous data not flushed yet";
- DataOutputBuffer tmp = bufReady;
+ TxnBuffer tmp = bufReady;
bufReady = bufCurrent;
bufCurrent = tmp;
- writer = new FSEditLogOp.Writer(bufCurrent);
}
/**
@@ -102,4 +101,50 @@ public int countBufferedBytes() {
return bufReady.size() + bufCurrent.size();
}
+ /**
+ * @return the transaction ID of the first transaction ready to be flushed
+ */
+ public long getFirstReadyTxId() {
+ assert bufReady.firstTxId > 0;
+ return bufReady.firstTxId;
+ }
+
+ /**
+ * @return the number of transactions that are ready to be flushed
+ */
+ public int countReadyTxns() {
+ return bufReady.numTxns;
+ }
+
+
+ private static class TxnBuffer extends DataOutputBuffer {
+ long firstTxId;
+ int numTxns;
+ private Writer writer;
+
+ public TxnBuffer(int initBufferSize) {
+ super(initBufferSize);
+ writer = new FSEditLogOp.Writer(this);
+ reset();
+ }
+
+ public void writeOp(FSEditLogOp op) throws IOException {
+ if (firstTxId == FSConstants.INVALID_TXID) {
+ firstTxId = op.txid;
+ } else {
+ assert op.txid > firstTxId;
+ }
+ writer.writeOp(op);
+ numTxns++;
+ }
+
+ @Override
+ public DataOutputBuffer reset() {
+ super.reset();
+ firstTxId = FSConstants.INVALID_TXID;
+ numTxns = 0;
+ return this;
+ }
+ }
+
}
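
With TxnBuffer tracking firstTxId and numTxns, each flush cycle exposes exactly which transaction range is being shipped. An illustrative cycle (op1, op2, and out are placeholders; exception handling elided):

    // Illustrative flush cycle; op1/op2/out are placeholders.
    EditsDoubleBuffer buf = new EditsDoubleBuffer(512 * 1024);
    buf.writeOp(op1);                          // bufCurrent: firstTxId = op1.txid
    buf.writeOp(op2);                          // numTxns == 2
    buf.setReadyToFlush();                     // swap; new writes go to the other buffer
    long firstTxId = buf.getFirstReadyTxId();  // == op1.txid
    int numTxns = buf.countReadyTxns();        // == 2
    buf.flushTo(out);                          // drain the ready buffer to the stream
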
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
index 6ac35efa103..7c59bdf5722 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSDirectory.java
@@ -22,9 +22,7 @@
import java.io.Closeable;
import java.io.FileNotFoundException;
import java.io.IOException;
-import java.net.URI;
import java.util.ArrayList;
-import java.util.Collection;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
@@ -157,26 +155,33 @@ private BlockManager getBlockManager() {
return getFSNamesystem().getBlockManager();
}
- void loadFSImage(Collection<URI> dataDirs,
- Collection<URI> editsDirs,
- StartupOption startOpt)
+ /**
+ * Load the filesystem image into memory.
+ *
+ * @param startOpt Startup type as specified by the user.
+ * @throws IOException If image or editlog cannot be read.
+ */
+ void loadFSImage(StartupOption startOpt)
throws IOException {
// format before starting up if requested
if (startOpt == StartupOption.FORMAT) {
- fsImage.getStorage().setStorageDirectories(dataDirs, editsDirs);
- fsImage.getStorage().format(fsImage.getStorage().determineClusterId()); // reuse current id
+ fsImage.format(fsImage.getStorage().determineClusterId());// reuse current id
+
startOpt = StartupOption.REGULAR;
}
+ boolean success = false;
try {
- if (fsImage.recoverTransitionRead(dataDirs, editsDirs, startOpt)) {
- fsImage.saveNamespace(true);
+ if (fsImage.recoverTransitionRead(startOpt)) {
+ fsImage.saveNamespace();
}
- FSEditLog editLog = fsImage.getEditLog();
- assert editLog != null : "editLog must be initialized";
+ fsImage.openEditLog();
+
fsImage.setCheckpointDirectories(null, null);
- } catch(IOException e) {
- fsImage.close();
- throw e;
+ success = true;
+ } finally {
+ if (!success) {
+ fsImage.close();
+ }
}
writeLock();
try {
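
The rewritten error handling in loadFSImage uses the success-flag idiom: the image is closed on any failure path, while the original exception propagates unmasked instead of being replaced by a catch-and-rethrow. In general form (doWork and resource are placeholders):

    // General form of the cleanup idiom used in loadFSImage above.
    boolean success = false;
    try {
      doWork();              // any step here may throw
      success = true;
    } finally {
      if (!success) {
        resource.close();    // clean up only on failure; don't mask the cause
      }
    }
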
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
index 0321d20b87b..bf137ae9c62 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java
@@ -17,10 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
-import java.io.File;
import java.io.IOException;
-import java.io.DataOutputStream;
-import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.zip.Checksum;
@@ -33,22 +30,26 @@
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
-import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import static org.apache.hadoop.hdfs.server.common.Util.now;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NNStorageListener;
-import org.apache.hadoop.hdfs.server.namenode.JournalStream.JournalType;
+import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger;
import org.apache.hadoop.hdfs.server.namenode.metrics.NameNodeMetrics;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
+import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.security.token.delegation.DelegationKey;
import org.apache.hadoop.util.PureJavaCrc32;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.*;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.*;
@@ -58,23 +59,42 @@
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
-public class FSEditLog implements NNStorageListener {
+public class FSEditLog {
static final String NO_JOURNAL_STREAMS_WARNING = "!!! WARNING !!!" +
" File system changes are not persistent. No journal streams.";
- private static final Log LOG = LogFactory.getLog(FSEditLog.class);
+ static final Log LOG = LogFactory.getLog(FSEditLog.class);
- private volatile int sizeOutputFlushBuffer = 512*1024;
+ /**
+ * State machine for edit log.
+ * The log starts in UNINITIALIZED state upon construction. Once it's
+ * initialized, it is usually in IN_SEGMENT state, indicating that edits
+ * may be written. In the middle of a roll, or while saving the namespace,
+ * it briefly enters the BETWEEN_LOG_SEGMENTS state, indicating that the
+ * previous segment has been closed, but the new one has not yet been opened.
+ */
+ private enum State {
+ UNINITIALIZED,
+ BETWEEN_LOG_SEGMENTS,
+ IN_SEGMENT,
+ CLOSED;
+ }
+ private State state = State.UNINITIALIZED;
- private ArrayList<EditLogOutputStream> editStreams = null;
+ private List<JournalAndStream> journals = Lists.newArrayList();
+
// a monotonically increasing counter that represents transactionIds.
private long txid = 0;
// stores the last synced transactionId.
private long synctxid = 0;
+ // the first txid of the log that's currently open for writing.
+ // If this value is N, we are currently writing to edits_inprogress_N
+ private long curSegmentTxId = FSConstants.INVALID_TXID;
+
// the time of printing the statistics to the log file.
private long lastPrintTime;
@@ -83,6 +103,10 @@ public class FSEditLog implements NNStorageListener {
// is an automatic sync scheduled?
private volatile boolean isAutoSyncScheduled = false;
+
+ // Used to exit in the event of a failure to sync to all journals. It's a
+ // member variable so it can be swapped out for testing.
+ private Runtime runtime = Runtime.getRuntime();
// these are statistics counters.
private long numTransactions; // number of transactions
@@ -122,226 +146,90 @@ protected synchronized TransactionId initialValue() {
FSEditLog(NNStorage storage) {
isSyncRunning = false;
this.storage = storage;
- this.storage.registerListener(this);
metrics = NameNode.getNameNodeMetrics();
lastPrintTime = now();
}
- private File getEditFile(StorageDirectory sd) {
- return storage.getEditFile(sd);
- }
-
- private File getEditNewFile(StorageDirectory sd) {
- return storage.getEditNewFile(sd);
- }
-
- private int getNumEditsDirs() {
- return storage.getNumStorageDirs(NameNodeDirType.EDITS);
- }
-
- synchronized int getNumEditStreams() {
- return editStreams == null ? 0 : editStreams.size();
- }
-
/**
- * Return the currently active edit streams.
- * This should be used only by unit tests.
+ * Initialize the list of edit journals
*/
- ArrayList<EditLogOutputStream> getEditStreams() {
- return editStreams;
+ synchronized void initJournals() {
+ assert journals.isEmpty();
+ Preconditions.checkState(state == State.UNINITIALIZED,
+ "Bad state: %s", state);
+
+ for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.EDITS)) {
+ journals.add(new JournalAndStream(new FileJournalManager(sd)));
+ }
+
+ if (journals.isEmpty()) {
+ LOG.error("No edits directories configured!");
+ }
+
+ state = State.BETWEEN_LOG_SEGMENTS;
}
-
- boolean isOpen() {
- return getNumEditStreams() > 0;
- }
-
+
/**
- * Create empty edit log files.
- * Initialize the output stream for logging.
- *
- * @throws IOException
+ * Initialize the output stream for logging, opening the first
+ * log segment.
*/
synchronized void open() throws IOException {
- numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;
- if (editStreams == null)
- editStreams = new ArrayList<EditLogOutputStream>();
-
- ArrayList<StorageDirectory> al = null;
- for (Iterator<StorageDirectory> it
- = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
- StorageDirectory sd = it.next();
- File eFile = getEditFile(sd);
- try {
- addNewEditLogStream(eFile);
- } catch (IOException e) {
- LOG.warn("Unable to open edit log file " + eFile);
- // Remove the directory from list of storage directories
- if(al == null) al = new ArrayList<StorageDirectory>(1);
- al.add(sd);
- }
- }
-
- if (al != null)
- storage.reportErrorsOnDirectories(al);
-
- // If there was an error in every storage dir, each one will have
- // been removed from the list of storage directories.
- if (storage.getNumStorageDirs(NameNodeDirType.EDITS) == 0) {
- throw new IOException(
- "Failed to initialize edits log in any storage directory.");
- }
+ Preconditions.checkState(state == State.UNINITIALIZED);
+ initJournals();
+
+ startLogSegment(getLastWrittenTxId() + 1, true);
+ assert state == State.IN_SEGMENT : "Bad state: " + state;
}
-
- synchronized void addNewEditLogStream(File eFile) throws IOException {
- EditLogOutputStream eStream = new EditLogFileOutputStream(eFile,
- sizeOutputFlushBuffer);
- editStreams.add(eStream);
- }
-
- synchronized void createEditLogFile(File name) throws IOException {
- waitForSyncToFinish();
-
- EditLogOutputStream eStream = new EditLogFileOutputStream(name,
- sizeOutputFlushBuffer);
- eStream.create();
- eStream.close();
+ synchronized boolean isOpen() {
+ return state == State.IN_SEGMENT;
}
/**
* Shutdown the file store.
*/
synchronized void close() {
- waitForSyncToFinish();
- if (editStreams == null || editStreams.isEmpty()) {
+ if (state == State.CLOSED) {
+ LOG.warn("Closing log when already closed", new Exception());
return;
}
- printStatistics(true);
- numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;
-
- ArrayList<EditLogOutputStream> errorStreams = null;
- Iterator<EditLogOutputStream> it = getOutputStreamIterator(null);
- while(it.hasNext()) {
- EditLogOutputStream eStream = it.next();
- try {
- closeStream(eStream);
- } catch (IOException e) {
- LOG.warn("FSEditLog:close - failed to close stream "
- + eStream.getName());
- if(errorStreams == null)
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- errorStreams.add(eStream);
- }
- }
- disableAndReportErrorOnStreams(errorStreams);
- editStreams.clear();
- }
-
- /**
- * Close and remove edit log stream.
- * @param index of the stream
- */
- synchronized private void removeStream(int index) {
- EditLogOutputStream eStream = editStreams.get(index);
- try {
- eStream.close();
- } catch (Exception e) {}
- editStreams.remove(index);
- }
-
- /**
- * The specified streams have IO errors. Close and remove them.
- */
- synchronized
- void disableAndReportErrorOnStreams(List<EditLogOutputStream> errorStreams) {
- if (errorStreams == null || errorStreams.size() == 0) {
- return; // nothing to do
- }
- ArrayList<StorageDirectory> errorDirs = new ArrayList<StorageDirectory>();
- for (EditLogOutputStream e : errorStreams) {
- if (e.getType() == JournalType.FILE) {
- errorDirs.add(getStorageDirectoryForStream(e));
- } else {
- disableStream(e);
- }
+
+ if (state == State.IN_SEGMENT) {
+ assert !journals.isEmpty();
+ waitForSyncToFinish();
+ endCurrentLogSegment(true);
}
- try {
- storage.reportErrorsOnDirectories(errorDirs);
- } catch (IOException ioe) {
- LOG.error("Problem erroring streams " + ioe);
- }
- }
-
-
- /**
- * get an editStream corresponding to a sd
- * @param es - stream to remove
- * @return the matching stream
- */
- StorageDirectory getStorage(EditLogOutputStream es) {
- String parentStorageDir = ((EditLogFileOutputStream)es).getFile()
- .getParentFile().getParentFile().getAbsolutePath();
-
- Iterator<StorageDirectory> it = storage.dirIterator();
- while (it.hasNext()) {
- StorageDirectory sd = it.next();
- LOG.info("comparing: " + parentStorageDir + " and " + sd.getRoot().getAbsolutePath());
- if (parentStorageDir.equals(sd.getRoot().getAbsolutePath()))
- return sd;
- }
- return null;
- }
-
- /**
- * get an editStream corresponding to a sd
- * @param sd
- * @return the matching stream
- */
- synchronized EditLogOutputStream getEditsStream(StorageDirectory sd) {
- for (EditLogOutputStream es : editStreams) {
- File parentStorageDir = ((EditLogFileOutputStream)es).getFile()
- .getParentFile().getParentFile();
- if (parentStorageDir.getName().equals(sd.getRoot().getName()))
- return es;
- }
- return null;
- }
-
- /**
- * check if edits.new log exists in the specified stoorage directory
- */
- boolean existsNew(StorageDirectory sd) {
- return getEditNewFile(sd).exists();
+ state = State.CLOSED;
}
/**
* Write an operation to the edit log. Do not sync to persistent
* store yet.
*/
- void logEdit(FSEditLogOp op) {
+ void logEdit(final FSEditLogOp op) {
synchronized (this) {
+ assert state != State.CLOSED;
+
// wait if an automatic sync is scheduled
waitIfAutoSyncScheduled();
- if(getNumEditStreams() == 0)
+ if (journals.isEmpty()) {
throw new java.lang.IllegalStateException(NO_JOURNAL_STREAMS_WARNING);
- ArrayList<EditLogOutputStream> errorStreams = null;
- long start = now();
- for(EditLogOutputStream eStream : editStreams) {
- if(!eStream.isOperationSupported(op.opCode.getOpCode()))
- continue;
- try {
- eStream.write(op);
- } catch (IOException ie) {
- LOG.error("logEdit: removing "+ eStream.getName(), ie);
- if(errorStreams == null)
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- errorStreams.add(eStream);
- }
}
- disableAndReportErrorOnStreams(errorStreams);
- recordTransaction(start);
+
+ long start = beginTransaction();
+ op.setTransactionId(txid);
+
+ mapJournalsAndReportErrors(new JournalClosure() {
+ @Override
+ public void apply(JournalAndStream jas) throws IOException {
+ if (!jas.isActive()) return;
+ jas.stream.write(op);
+ }
+ }, "logging edit");
+
+ endTransaction(start);
// check if it is time to schedule an automatic sync
if (!shouldForceSync()) {
@@ -384,15 +272,18 @@ synchronized void doneWithAutoSyncScheduling() {
* @return true if any of the edit stream says that it should sync
*/
private boolean shouldForceSync() {
- for (EditLogOutputStream eStream : editStreams) {
- if (eStream.shouldForceSync()) {
+ for (JournalAndStream jas : journals) {
+ if (!jas.isActive()) continue;
+
+ if (jas.getCurrentStream().shouldForceSync()) {
return true;
}
}
return false;
}
- private void recordTransaction(long start) {
+ private long beginTransaction() {
+ assert Thread.holdsLock(this);
// get a new transactionId
txid++;
@@ -401,7 +292,12 @@ private void recordTransaction(long start) {
//
TransactionId id = myTransactionId.get();
id.txid = txid;
-
+ return now();
+ }
+
+ private void endTransaction(long start) {
+ assert Thread.holdsLock(this);
+
// update statistics
long end = now();
numTransactions++;
@@ -410,6 +306,35 @@ private void recordTransaction(long start) {
metrics.addTransaction(end-start);
}
+ /**
+ * Return the transaction ID of the last transaction written to the log.
+ */
+ synchronized long getLastWrittenTxId() {
+ return txid;
+ }
+
+ /**
+ * @return the first transaction ID in the current log segment
+ */
+ synchronized long getCurSegmentTxId() {
+ Preconditions.checkState(state == State.IN_SEGMENT,
+ "Bad state: %s", state);
+ return curSegmentTxId;
+ }
+
+ /**
+ * Set the transaction ID to use for the next transaction written.
+ */
+ synchronized void setNextTxId(long nextTxId) {
+ Preconditions.checkArgument(synctxid <= txid &&
+ nextTxId >= txid,
+ "May not decrease txid." +
+ " synctxid=%s txid=%s nextTxId=%s",
+ synctxid, txid, nextTxId);
+
+ txid = nextTxId - 1;
+ }
+
/**
* Blocks until all ongoing edits have been synced to disk.
* This differs from logSync in that it waits for edits that have been
@@ -457,12 +382,15 @@ void logSyncAll() throws IOException {
* waitForSyncToFinish() before assuming they are running alone.
*/
public void logSync() {
- ArrayList<EditLogOutputStream> errorStreams = null;
long syncStart = 0;
// Fetch the transactionId of this thread.
long mytxid = myTransactionId.get().txid;
- ArrayList<EditLogOutputStream> streams = new ArrayList<EditLogOutputStream>();
+
+ List<JournalAndStream> candidateJournals =
+ Lists.newArrayListWithCapacity(journals.size());
+ List<JournalAndStream> badJournals = Lists.newArrayList();
+
boolean sync = false;
try {
synchronized (this) {
@@ -493,20 +421,16 @@ public void logSync() {
sync = true;
// swap buffers
- assert editStreams.size() > 0 : "no editlog streams";
- for(EditLogOutputStream eStream : editStreams) {
+ assert !journals.isEmpty() : "no editlog streams";
+
+ for (JournalAndStream jas : journals) {
+ if (!jas.isActive()) continue;
try {
- eStream.setReadyToFlush();
- streams.add(eStream);
+ jas.getCurrentStream().setReadyToFlush();
+ candidateJournals.add(jas);
} catch (IOException ie) {
LOG.error("Unable to get ready to flush.", ie);
- //
- // remember the streams that encountered an error.
- //
- if (errorStreams == null) {
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- }
- errorStreams.add(eStream);
+ badJournals.add(jas);
}
}
} finally {
@@ -517,29 +441,36 @@ public void logSync() {
// do the sync
long start = now();
- for (EditLogOutputStream eStream : streams) {
+ for (JournalAndStream jas : candidateJournals) {
+ if (!jas.isActive()) continue;
try {
- eStream.flush();
+ jas.getCurrentStream().flush();
} catch (IOException ie) {
LOG.error("Unable to sync edit log.", ie);
//
// remember the streams that encountered an error.
//
- if (errorStreams == null) {
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- }
- errorStreams.add(eStream);
+ badJournals.add(jas);
}
}
long elapsed = now() - start;
- disableAndReportErrorOnStreams(errorStreams);
+ disableAndReportErrorOnJournals(badJournals);
- if (metrics != null) // Metrics non-null only when used inside name node
+ if (metrics != null) { // Metrics non-null only when used inside name node
metrics.addSync(elapsed);
+ }
+
} finally {
// Prevent RuntimeException from blocking other log edit sync
synchronized (this) {
if (sync) {
+ if (badJournals.size() >= journals.size()) {
+ LOG.fatal("Could not sync any journal to persistent storage. " +
+ "Unsynced transactions: " + (txid - synctxid),
+ new Exception());
+ runtime.exit(1);
+ }
+
synctxid = syncStart;
isSyncRunning = false;
}
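
Since logSync() now terminates the process when no journal can be synced, the injectable runtime field lets a test observe the exit instead of killing the test JVM. A hypothetical Mockito-based sketch (the surrounding test harness and the journal-failure setup are assumptions):

    // Hypothetical test sketch: verify that losing all journals is fatal,
    // without actually exiting the JVM.
    Runtime mockRuntime = Mockito.mock(Runtime.class);
    editLog.setRuntimeForTesting(mockRuntime);
    // ... make every journal fail, log an edit, then call logSync() ...
    Mockito.verify(mockRuntime).exit(1);
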
@@ -556,7 +487,7 @@ private void printStatistics(boolean force) {
if (lastPrintTime + 60000 > now && !force) {
return;
}
- if (editStreams == null || editStreams.size()==0) {
+ if (journals.isEmpty()) {
return;
}
lastPrintTime = now;
@@ -568,12 +499,17 @@ private void printStatistics(boolean force) {
buf.append("Number of transactions batched in Syncs: ");
buf.append(numTransactionsBatchedInSync);
buf.append(" Number of syncs: ");
- buf.append(editStreams.get(0).getNumSync());
+ for (JournalAndStream jas : journals) {
+ if (!jas.isActive()) continue;
+ buf.append(jas.getCurrentStream().getNumSync());
+ break;
+ }
+
buf.append(" SyncTimes(ms): ");
- int numEditStreams = editStreams.size();
- for (int idx = 0; idx < numEditStreams; idx++) {
- EditLogOutputStream eStream = editStreams.get(idx);
+ for (JournalAndStream jas : journals) {
+ if (!jas.isActive()) continue;
+ EditLogOutputStream eStream = jas.getCurrentStream();
buf.append(eStream.getTotalSyncTime());
buf.append(" ");
}
@@ -788,112 +724,180 @@ void logReassignLease(String leaseHolder, String src, String newHolder) {
}
/**
- * Return the size of the current EditLog
+ * @return the number of active (non-failed) journals
*/
- synchronized long getEditLogSize() throws IOException {
- assert getNumEditsDirs() <= getNumEditStreams() :
- "Number of edits directories should not exceed the number of streams.";
- long size = 0;
- ArrayList<EditLogOutputStream> al = null;
- for (int idx = 0; idx < getNumEditStreams(); idx++) {
- EditLogOutputStream es = editStreams.get(idx);
- try {
- long curSize = es.length();
- assert (size == 0 || size == curSize || curSize ==0) :
- "Wrong streams size";
- size = Math.max(size, curSize);
- } catch (IOException e) {
- LOG.error("getEditLogSize: editstream.length failed. removing editlog (" +
- idx + ") " + es.getName());
- if(al==null) al = new ArrayList<EditLogOutputStream>(1);
- al.add(es);
+ private int countActiveJournals() {
+ int count = 0;
+ for (JournalAndStream jas : journals) {
+ if (jas.isActive()) {
+ count++;
}
}
- if(al!=null) disableAndReportErrorOnStreams(al);
- return size;
+ return count;
}
/**
- * Closes the current edit log and opens edits.new.
+ * Used only by unit tests.
*/
- synchronized void rollEditLog() throws IOException {
- waitForSyncToFinish();
- Iterator<StorageDirectory> it = storage.dirIterator(NameNodeDirType.EDITS);
- if(!it.hasNext())
- return;
- //
- // If edits.new already exists in some directory, verify it
- // exists in all directories.
- //
- boolean alreadyExists = existsNew(it.next());
- while(it.hasNext()) {
- StorageDirectory sd = it.next();
- if(alreadyExists != existsNew(sd))
- throw new IOException(getEditNewFile(sd)
- + "should " + (alreadyExists ? "" : "not ") + "exist.");
- }
- if(alreadyExists)
- return; // nothing to do, edits.new exists!
+ @VisibleForTesting
+ List<JournalAndStream> getJournals() {
+ return journals;
+ }
+
+ /**
+ * Used only by unit tests.
+ */
+ @VisibleForTesting
+ synchronized void setRuntimeForTesting(Runtime runtime) {
+ this.runtime = runtime;
+ }
+
+ /**
+ * Return a manifest of what finalized edit logs are available
+ */
+ public RemoteEditLogManifest getEditLogManifest(long sinceTxId)
+ throws IOException {
+ FSImageTransactionalStorageInspector inspector =
+ new FSImageTransactionalStorageInspector();
- // check if any of failed storage is now available and put it back
+ for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.EDITS)) {
+ inspector.inspectDirectory(sd);
+ }
+
+ return inspector.getEditLogManifest(sinceTxId);
+ }
+
+ /**
+ * Finalizes the current edit log and opens a new log segment.
+ * @return the transaction id of the BEGIN_LOG_SEGMENT transaction
+ * in the new log.
+ */
+ synchronized long rollEditLog() throws IOException {
+ LOG.info("Rolling edit logs.");
+ endCurrentLogSegment(true);
+
+ long nextTxId = getLastWrittenTxId() + 1;
+ startLogSegment(nextTxId, true);
+
+ assert curSegmentTxId == nextTxId;
+ return nextTxId;
+ }
+
+ /**
+ * Start writing to the log segment with the given txid.
+ * Transitions from BETWEEN_LOG_SEGMENTS state to IN_SEGMENT state.
+ */
+ synchronized void startLogSegment(final long segmentTxId,
+ boolean writeHeaderTxn) throws IOException {
+ LOG.info("Starting log segment at " + segmentTxId);
+ Preconditions.checkArgument(segmentTxId > 0,
+ "Bad txid: %s", segmentTxId);
+ Preconditions.checkState(state == State.BETWEEN_LOG_SEGMENTS,
+ "Bad state: %s", state);
+ Preconditions.checkState(segmentTxId > curSegmentTxId,
+ "Cannot start writing to log segment " + segmentTxId +
+ " when previous log segment started at " + curSegmentTxId);
+ Preconditions.checkArgument(segmentTxId == txid + 1,
+ "Cannot start log segment at txid %s when next expected " +
+ "txid is %s", segmentTxId, txid + 1);
+
+ numTransactions = totalTimeTransactions = numTransactionsBatchedInSync = 0;
+
+ // TODO no need to link this back to storage anymore!
+ // See HDFS-2174.
storage.attemptRestoreRemovedStorage();
-
- divertFileStreams(
- Storage.STORAGE_DIR_CURRENT + "/" + NameNodeFile.EDITS_NEW.getName());
- }
-
- /**
- * Divert file streams from file edits to file edits.new.
- * Close file streams, which are currently writing into edits files.
- * Create new streams based on file getRoot()/dest.
- * @param dest new stream path relative to the storage directory root.
- * @throws IOException
- */
- synchronized void divertFileStreams(String dest) throws IOException {
- waitForSyncToFinish();
-
- assert getNumEditStreams() >= getNumEditsDirs() :
- "Inconsistent number of streams";
- ArrayList<EditLogOutputStream> errorStreams = null;
- EditStreamIterator itE =
- (EditStreamIterator)getOutputStreamIterator(JournalType.FILE);
- Iterator<StorageDirectory> itD =
- storage.dirIterator(NameNodeDirType.EDITS);
- while(itE.hasNext() && itD.hasNext()) {
- EditLogOutputStream eStream = itE.next();
- StorageDirectory sd = itD.next();
- if(!eStream.getName().startsWith(sd.getRoot().getPath()))
- throw new IOException("Inconsistent order of edit streams: " + eStream);
- try {
- // close old stream
- closeStream(eStream);
- // create new stream
- eStream = new EditLogFileOutputStream(new File(sd.getRoot(), dest),
- sizeOutputFlushBuffer);
- eStream.create();
- // replace by the new stream
- itE.replace(eStream);
- } catch (IOException e) {
- LOG.warn("Error in editStream " + eStream.getName(), e);
- if(errorStreams == null)
- errorStreams = new ArrayList(1);
- errorStreams.add(eStream);
+
+ mapJournalsAndReportErrors(new JournalClosure() {
+ @Override
+ public void apply(JournalAndStream jas) throws IOException {
+ jas.startLogSegment(segmentTxId);
}
+ }, "starting log segment " + segmentTxId);
+
+ if (countActiveJournals() == 0) {
+ throw new IOException("Unable to start log segment " +
+ segmentTxId + ": no journals successfully started.");
+ }
+
+ curSegmentTxId = segmentTxId;
+ state = State.IN_SEGMENT;
+
+ if (writeHeaderTxn) {
+ logEdit(LogSegmentOp.getInstance(
+ FSEditLogOpCodes.OP_START_LOG_SEGMENT));
+ logSync();
}
- disableAndReportErrorOnStreams(errorStreams);
}
/**
- * Removes the old edit log and renames edits.new to edits.
- * Reopens the edits file.
+ * Finalize the current log segment.
+ * Transitions from IN_SEGMENT state to BETWEEN_LOG_SEGMENTS state.
*/
- synchronized void purgeEditLog() throws IOException {
- waitForSyncToFinish();
- revertFileStreams(
- Storage.STORAGE_DIR_CURRENT + "/" + NameNodeFile.EDITS_NEW.getName());
+ synchronized void endCurrentLogSegment(boolean writeEndTxn) {
+ LOG.info("Ending log segment " + curSegmentTxId);
+ Preconditions.checkState(state == State.IN_SEGMENT,
+ "Bad state: %s", state);
+
+ if (writeEndTxn) {
+ logEdit(LogSegmentOp.getInstance(
+ FSEditLogOpCodes.OP_END_LOG_SEGMENT));
+ logSync();
+ }
+
+ printStatistics(true);
+
+ final long lastTxId = getLastWrittenTxId();
+
+ mapJournalsAndReportErrors(new JournalClosure() {
+ @Override
+ public void apply(JournalAndStream jas) throws IOException {
+ if (jas.isActive()) {
+ jas.close(lastTxId);
+ }
+ }
+ }, "ending log segment");
+
+ state = State.BETWEEN_LOG_SEGMENTS;
+ }
+
+ /**
+ * Abort all current logs. Called from the backup node.
+ */
+ synchronized void abortCurrentLogSegment() {
+ mapJournalsAndReportErrors(new JournalClosure() {
+
+ @Override
+ public void apply(JournalAndStream jas) throws IOException {
+ jas.abort();
+ }
+ }, "aborting all streams");
+ state = State.BETWEEN_LOG_SEGMENTS;
}
+ /**
+ * Archive any log files that are older than the given txid.
+ */
+ public void purgeLogsOlderThan(
+ final long minTxIdToKeep, final StoragePurger purger) {
+ synchronized (this) {
+ // synchronized to prevent findbugs warning about inconsistent
+ // synchronization. This will be JIT-ed out if asserts are
+ // off.
+ assert curSegmentTxId == FSConstants.INVALID_TXID || // on format this is no-op
+ minTxIdToKeep <= curSegmentTxId :
+ "cannot purge logs older than txid " + minTxIdToKeep +
+ " when current segment starts at " + curSegmentTxId;
+ }
+
+ mapJournalsAndReportErrors(new JournalClosure() {
+ @Override
+ public void apply(JournalAndStream jas) throws IOException {
+ jas.manager.purgeLogsOlderThan(minTxIdToKeep, purger);
+ }
+ }, "purging logs older than " + minTxIdToKeep);
+ }
+
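
Taken together, these methods make a roll simply "end the current segment, then start the next one at lastWrittenTxId + 1". An illustrative lifecycle, annotated with the state machine from the top of this file (txids are examples; in-progress files follow the edits_inprogress_N naming noted above):

    // Illustrative segment lifecycle (txids are examples).
    editLog.open();         // UNINITIALIZED -> BETWEEN_LOG_SEGMENTS -> IN_SEGMENT;
                            // txn 1 is the OP_START_LOG_SEGMENT header
    // ... application edits are logged as txns 2..41 ...
    long segTxId = editLog.rollEditLog();
                            // txn 42 = OP_END_LOG_SEGMENT finalizes segment [1,42];
                            // txn 43 = OP_START_LOG_SEGMENT of the new segment;
                            // returns 43
    editLog.close();        // ends the open segment; state -> CLOSED
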
/**
* The actual sync activity happens while not synchronized on this object.
* Thus, synchronized activities that require that they are not concurrent
@@ -907,85 +911,6 @@ synchronized void waitForSyncToFinish() {
}
}
- /**
- * Revert file streams from file edits.new back to file edits.
- * Close file streams, which are currently writing into getRoot()/source.
- * Rename getRoot()/source to edits.
- * Reopen streams so that they start writing into edits files.
- * @param dest new stream path relative to the storage directory root.
- * @throws IOException
- */
- synchronized void revertFileStreams(String source) throws IOException {
- waitForSyncToFinish();
-
- assert getNumEditStreams() >= getNumEditsDirs() :
- "Inconsistent number of streams";
- ArrayList<EditLogOutputStream> errorStreams = null;
- EditStreamIterator itE =
- (EditStreamIterator)getOutputStreamIterator(JournalType.FILE);
- Iterator<StorageDirectory> itD =
- storage.dirIterator(NameNodeDirType.EDITS);
- while(itE.hasNext() && itD.hasNext()) {
- EditLogOutputStream eStream = itE.next();
- StorageDirectory sd = itD.next();
- if(!eStream.getName().startsWith(sd.getRoot().getPath()))
- throw new IOException("Inconsistent order of edit streams: " + eStream +
- " does not start with " + sd.getRoot().getPath());
- try {
- // close old stream
- closeStream(eStream);
- // rename edits.new to edits
- File editFile = getEditFile(sd);
- File prevEditFile = new File(sd.getRoot(), source);
- if(prevEditFile.exists()) {
- if(!prevEditFile.renameTo(editFile)) {
- //
- // renameTo() fails on Windows if the destination
- // file exists.
- //
- if(!editFile.delete() || !prevEditFile.renameTo(editFile)) {
- throw new IOException("Rename failed for " + sd.getRoot());
- }
- }
- }
- // open new stream
- eStream = new EditLogFileOutputStream(editFile, sizeOutputFlushBuffer);
- // replace by the new stream
- itE.replace(eStream);
- } catch (IOException e) {
- LOG.warn("Error in editStream " + eStream.getName(), e);
- if(errorStreams == null)
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- errorStreams.add(eStream);
- }
- }
- disableAndReportErrorOnStreams(errorStreams);
- }
-
- /**
- * Return the name of the edit file
- */
- synchronized File getFsEditName() {
- StorageDirectory sd = null;
- for (Iterator<StorageDirectory> it =
- storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
- sd = it.next();
- if(sd.getRoot().canRead())
- return getEditFile(sd);
- }
- return null;
- }
-
- /**
- * Returns the timestamp of the edit log
- */
- synchronized long getFsEditTime() {
- Iterator<StorageDirectory> it = storage.dirIterator(NameNodeDirType.EDITS);
- if(it.hasNext())
- return getEditFile(it.next()).lastModified();
- return 0;
- }
-
/**
* Return the txid of the last synced transaction.
* For test use only
@@ -996,273 +921,224 @@ synchronized long getSyncTxId() {
// sets the initial capacity of the flush buffer.
- public void setBufferCapacity(int size) {
- sizeOutputFlushBuffer = size;
- }
-
-
- boolean isEmpty() throws IOException {
- return getEditLogSize() <= 0;
+ public void setOutputBufferCapacity(int size) {
+ for (JournalAndStream jas : journals) {
+ jas.manager.setOutputBufferCapacity(size);
+ }
}
/**
* Create (or find if already exists) an edit output stream, which
* streams journal records (edits) to the specified backup node.
- * Send a record, prescribing to start journal spool.
- * This should be sent via regular stream of journal records so that
- * the backup node new exactly after which record it should start spooling.
+ *
+ * The new BackupNode will start receiving edits the next time this
+ * NameNode's logs roll.
*
* @param bnReg the backup node registration information.
* @param nnReg this (active) name-node registration.
* @throws IOException
*/
- synchronized void logJSpoolStart(NamenodeRegistration bnReg, // backup node
- NamenodeRegistration nnReg) // active name-node
+ synchronized void registerBackupNode(
+ NamenodeRegistration bnReg, // backup node
+ NamenodeRegistration nnReg) // active name-node
throws IOException {
if(bnReg.isRole(NamenodeRole.CHECKPOINT))
return; // checkpoint node does not stream edits
- if(editStreams == null)
- editStreams = new ArrayList<EditLogOutputStream>();
- EditLogOutputStream boStream = null;
- for(EditLogOutputStream eStream : editStreams) {
- if(eStream.getName().equals(bnReg.getAddress())) {
- boStream = eStream; // already there
- break;
+
+ JournalAndStream jas = findBackupJournalAndStream(bnReg);
+ if (jas != null) {
+ // already registered
+ LOG.info("Backup node " + bnReg + " re-registers");
+ return;
+ }
+
+ LOG.info("Registering new backup node: " + bnReg);
+ BackupJournalManager bjm = new BackupJournalManager(bnReg, nnReg);
+ journals.add(new JournalAndStream(bjm));
+ }
+
+ synchronized void releaseBackupStream(NamenodeRegistration registration) {
+ for (Iterator<JournalAndStream> iter = journals.iterator();
+ iter.hasNext();) {
+ JournalAndStream jas = iter.next();
+ if (jas.manager instanceof BackupJournalManager &&
+ ((BackupJournalManager)jas.manager).matchesRegistration(
+ registration)) {
+ jas.abort();
+ LOG.info("Removing backup journal " + jas);
+ iter.remove();
}
}
- if(boStream == null) {
- boStream = new EditLogBackupOutputStream(bnReg, nnReg);
- editStreams.add(boStream);
+ }
+
+ /**
+ * Find the JournalAndStream associated with this BackupNode.
+ * @return null if it cannot be found
+ */
+ private synchronized JournalAndStream findBackupJournalAndStream(
+ NamenodeRegistration bnReg) {
+ for (JournalAndStream jas : journals) {
+ if (jas.manager instanceof BackupJournalManager) {
+ BackupJournalManager bjm = (BackupJournalManager)jas.manager;
+ if (bjm.matchesRegistration(bnReg)) {
+ return jas;
+ }
+ }
}
- logEdit(JSpoolStartOp.getInstance());
+ return null;
}
/**
* Write an operation to the edit log. Do not sync to persistent
* store yet.
+ */
+ synchronized void logEdit(final int length, final byte[] data) {
+ long start = beginTransaction();
+
+ mapJournalsAndReportErrors(new JournalClosure() {
+ @Override
+ public void apply(JournalAndStream jas) throws IOException {
+ if (jas.isActive()) {
+ jas.getCurrentStream().writeRaw(data, 0, length); // TODO writeRaw
+ }
+ }
+ }, "Logging edit");
+
+ endTransaction(start);
+ }
+
+ //// Iteration across journals
+ private interface JournalClosure {
+ public void apply(JournalAndStream jas) throws IOException;
+ }
+
+ /**
+ * Apply the given function across all of the journal managers, disabling
+ * any for which the closure throws an IOException.
+ * @param status message used for logging errors (e.g. "opening journal")
*/
- synchronized void logEdit(int length, byte[] data) {
- if(getNumEditStreams() == 0)
- throw new java.lang.IllegalStateException(NO_JOURNAL_STREAMS_WARNING);
- ArrayList<EditLogOutputStream> errorStreams = null;
- long start = now();
- for(EditLogOutputStream eStream : editStreams) {
+ private void mapJournalsAndReportErrors(
+ JournalClosure closure, String status) {
+ List<JournalAndStream> badJAS = Lists.newLinkedList();
+ for (JournalAndStream jas : journals) {
try {
- eStream.writeRaw(data, 0, length);
- } catch (IOException ie) {
- LOG.warn("Error in editStream " + eStream.getName(), ie);
- if(errorStreams == null)
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- errorStreams.add(eStream);
- }
- }
- disableAndReportErrorOnStreams(errorStreams);
- recordTransaction(start);
- }
-
- /**
- * Iterates output streams based of the same type.
- * Type null will iterate over all streams.
- */
- private class EditStreamIterator implements Iterator<EditLogOutputStream> {
- JournalType type;
- int prevIndex; // for remove()
- int nextIndex; // for next()
-
- EditStreamIterator(JournalType streamType) {
- this.type = streamType;
- this.nextIndex = 0;
- this.prevIndex = 0;
- }
-
- public boolean hasNext() {
- synchronized(FSEditLog.this) {
- if(editStreams == null ||
- editStreams.isEmpty() || nextIndex >= editStreams.size())
- return false;
- while(nextIndex < editStreams.size()
- && !editStreams.get(nextIndex).getType().isOfType(type))
- nextIndex++;
- return nextIndex < editStreams.size();
+ closure.apply(jas);
+ } catch (Throwable t) {
+ LOG.error("Error " + status + " (journal " + jas + ")", t);
+ badJAS.add(jas);
}
}
- public EditLogOutputStream next() {
- EditLogOutputStream stream = null;
- synchronized(FSEditLog.this) {
- stream = editStreams.get(nextIndex);
- prevIndex = nextIndex;
- nextIndex++;
- while(nextIndex < editStreams.size()
- && !editStreams.get(nextIndex).getType().isOfType(type))
- nextIndex++;
- }
- return stream;
- }
-
- public void remove() {
- nextIndex = prevIndex; // restore previous state
- removeStream(prevIndex); // remove last returned element
- hasNext(); // reset nextIndex to correct place
- }
-
- void replace(EditLogOutputStream newStream) {
- synchronized (FSEditLog.this) {
- assert 0 <= prevIndex && prevIndex < editStreams.size() :
- "Index out of bound.";
- editStreams.set(prevIndex, newStream);
- }
- }
- }
-
- /**
- * Get stream iterator for the specified type.
- */
- public Iterator<EditLogOutputStream>
- getOutputStreamIterator(JournalType streamType) {
- return new EditStreamIterator(streamType);
- }
-
- private void closeStream(EditLogOutputStream eStream) throws IOException {
- eStream.setReadyToFlush();
- eStream.flush();
- eStream.close();
- }
-
- void incrementCheckpointTime() {
- storage.incrementCheckpointTime();
- CheckpointTimeOp op = CheckpointTimeOp.getInstance()
- .setCheckpointTime(storage.getCheckpointTime());
- logEdit(op);
- }
-
- synchronized void releaseBackupStream(NamenodeRegistration registration) {
- Iterator<EditLogOutputStream> it =
- getOutputStreamIterator(JournalType.BACKUP);
- ArrayList<EditLogOutputStream> errorStreams = null;
- NamenodeRegistration backupNode = null;
- while(it.hasNext()) {
- EditLogBackupOutputStream eStream = (EditLogBackupOutputStream)it.next();
- backupNode = eStream.getRegistration();
- if(backupNode.getAddress().equals(registration.getAddress()) &&
- backupNode.isRole(registration.getRole())) {
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- errorStreams.add(eStream);
- break;
- }
- }
- assert backupNode == null || backupNode.isRole(NamenodeRole.BACKUP) :
- "Not a backup node corresponds to a backup stream";
- disableAndReportErrorOnStreams(errorStreams);
- }
-
- synchronized boolean checkBackupRegistration(
- NamenodeRegistration registration) {
- Iterator<EditLogOutputStream> it =
- getOutputStreamIterator(JournalType.BACKUP);
- boolean regAllowed = !it.hasNext();
- NamenodeRegistration backupNode = null;
- ArrayList<EditLogOutputStream> errorStreams = null;
- while(it.hasNext()) {
- EditLogBackupOutputStream eStream = (EditLogBackupOutputStream)it.next();
- backupNode = eStream.getRegistration();
- if(backupNode.getAddress().equals(registration.getAddress()) &&
- backupNode.isRole(registration.getRole())) {
- regAllowed = true; // same node re-registers
- break;
- }
- if(!eStream.isAlive()) {
- if(errorStreams == null)
- errorStreams = new ArrayList<EditLogOutputStream>(1);
- errorStreams.add(eStream);
- regAllowed = true; // previous backup node failed
- }
- }
- assert backupNode == null || backupNode.isRole(NamenodeRole.BACKUP) :
- "Not a backup node corresponds to a backup stream";
- disableAndReportErrorOnStreams(errorStreams);
- return regAllowed;
+ disableAndReportErrorOnJournals(badJAS);
}
-
/**
- * Get the StorageDirectory for a stream
- * @param es Stream whose StorageDirectory we wish to know
- * @return the matching StorageDirectory
+ * Called when some journals experience an error in some operation.
+ * This propagates errors to the storage level.
*/
- StorageDirectory getStorageDirectoryForStream(EditLogOutputStream es) {
- String parentStorageDir = ((EditLogFileOutputStream)es).getFile().getParentFile().getParentFile().getAbsolutePath();
-
- for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- FSNamesystem.LOG.info("comparing: " + parentStorageDir
- + " and " + sd.getRoot().getAbsolutePath());
- if (parentStorageDir.equals(sd.getRoot().getAbsolutePath()))
- return sd;
+ private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
+ if (badJournals == null || badJournals.isEmpty()) {
+ return; // nothing to do
}
- return null;
- }
-
- private synchronized void disableStream(EditLogOutputStream stream) {
- try { stream.close(); } catch (IOException e) {
- // nothing to do.
- LOG.warn("Failed to close eStream " + stream.getName()
- + " before removing it (might be ok)");
- }
- editStreams.remove(stream);
-
- if (editStreams.size() <= 0) {
- String msg = "Fatal Error: All storage directories are inaccessible.";
- LOG.fatal(msg, new IOException(msg));
- Runtime.getRuntime().exit(-1);
+
+ for (JournalAndStream j : badJournals) {
+ LOG.error("Disabling journal " + j);
+ j.abort();
}
}
/**
- * Error Handling on a storageDirectory
- *
+ * Container for a JournalManager paired with its currently
+ * active stream.
+ *
+ * If a Journal gets disabled due to an error writing to its
+ * stream, then the stream will be aborted and set to null.
*/
- // NNStorageListener Interface
- @Override // NNStorageListener
- public synchronized void errorOccurred(StorageDirectory sd)
+ static class JournalAndStream {
+ private final JournalManager manager;
+ private EditLogOutputStream stream;
+ private long segmentStartsAtTxId = FSConstants.INVALID_TXID;
+
+ private JournalAndStream(JournalManager manager) {
+ this.manager = manager;
+ }
+
+ private void startLogSegment(long txId) throws IOException {
+ Preconditions.checkState(stream == null);
+ stream = manager.startLogSegment(txId);
+ segmentStartsAtTxId = txId;
+ }
+
+ private void close(long lastTxId) throws IOException {
+ Preconditions.checkArgument(lastTxId >= segmentStartsAtTxId,
+ "invalid segment: lastTxId %s >= " +
+ "segment starting txid %s", lastTxId, segmentStartsAtTxId);
+
+ if (stream == null) return;
+ stream.close();
+ manager.finalizeLogSegment(segmentStartsAtTxId, lastTxId);
+ stream = null;
+ }
+
+ private void abort() {
+ if (stream == null) return;
+ try {
+ stream.abort();
+ } catch (IOException ioe) {
+ LOG.error("Unable to abort stream " + stream, ioe);
+ }
+ stream = null;
+ segmentStartsAtTxId = FSConstants.INVALID_TXID;
+ }
+
+ private boolean isActive() {
+ return stream != null;
+ }
+
+ @VisibleForTesting
+ EditLogOutputStream getCurrentStream() {
+ return stream;
+ }
+
+ @Override
+ public String toString() {
+ return "JournalAndStream(mgr=" + manager +
+ ", " + "stream=" + stream + ")";
+ }
+
+ @VisibleForTesting
+ void setCurrentStreamForTests(EditLogOutputStream stream) {
+ this.stream = stream;
+ }
+
+ @VisibleForTesting
+ JournalManager getManager() {
+ return manager;
+ }
+
+ private EditLogInputStream getInProgressInputStream() throws IOException {
+ return manager.getInProgressInputStream(segmentStartsAtTxId);
+ }
+ }
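Taken together, the methods above form a small per-journal state machine: startLogSegment() opens a stream for a new segment, close() finalizes the segment's txid range through the manager, and abort() drops the stream after a write error. A minimal sketch of how the enclosing FSEditLog might drive one instance (illustrative only; the real call sites are spread across this patch, and the private members are only visible from inside FSEditLog):

    JournalAndStream jas = new JournalAndStream(manager);
    jas.startLogSegment(1);              // stream opens; segment starts at txid 1
    try {
      // ... write edits for txids 1..100 to jas.getCurrentStream() ...
      jas.close(100);                    // finalizes segment [1, 100] via the manager
    } catch (IOException ioe) {
      jas.abort();                       // on failure, abort and null out the stream
    }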
+
+ /**
+ * @return an EditLogInputStream that reads from the same log that
+ * the edit log is currently writing. This is used from the BackupNode
+ * during edits synchronization.
+ * @throws IOException if no valid logs are available.
+ */
+ synchronized EditLogInputStream getInProgressFileInputStream()
throws IOException {
- if (editStreams == null) {
- //errors can occur on storage directories
- //before edit streams have been set up
- return;
- }
- ArrayList<EditLogOutputStream> errorStreams
- = new ArrayList<EditLogOutputStream>();
-
- for (EditLogOutputStream eStream : editStreams) {
- LOG.error("Unable to log edits to " + eStream.getName()
- + "; removing it");
-
- StorageDirectory streamStorageDir = getStorageDirectoryForStream(eStream);
- if (sd == streamStorageDir) {
- errorStreams.add(eStream);
+ for (JournalAndStream jas : journals) {
+ if (!jas.isActive()) continue;
+ try {
+ EditLogInputStream in = jas.getInProgressInputStream();
+ if (in != null) return in;
+ } catch (IOException ioe) {
+ LOG.warn("Unable to get the in-progress input stream from " + jas,
+ ioe);
}
}
-
- for (EditLogOutputStream eStream : errorStreams) {
- disableStream(eStream);
- }
- }
-
- @Override // NNStorageListener
- public synchronized void formatOccurred(StorageDirectory sd)
- throws IOException {
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
- createEditLogFile(NNStorage.getStorageFile(sd, NameNodeFile.EDITS));
- }
- };
-
- @Override // NNStorageListener
- public synchronized void directoryAvailable(StorageDirectory sd)
- throws IOException {
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
- File eFile = getEditFile(sd);
- addNewEditLogStream(eFile);
- }
+ throw new IOException("No in-progress stream provided edits");
}
}
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
index d8ecaf4e2ec..ebe19cfb08a 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
@@ -21,12 +21,12 @@
import java.io.BufferedInputStream;
import java.io.DataInputStream;
-import java.io.EOFException;
+import java.io.File;
+import java.io.FileInputStream;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
-import java.util.zip.CheckedInputStream;
import java.util.zip.Checksum;
import org.apache.hadoop.fs.permission.PermissionStatus;
@@ -37,6 +37,8 @@
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoUnderConstruction;
import org.apache.hadoop.hdfs.server.common.Storage;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.LogHeader;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.Reader;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.AddCloseOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.CancelDelegationTokenOp;
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.ClearNSQuotaOp;
@@ -71,75 +73,42 @@ public FSEditLogLoader(FSNamesystem fsNamesys) {
* This is where we apply edits that we've been writing to disk all
* along.
*/
- int loadFSEdits(EditLogInputStream edits) throws IOException {
+ int loadFSEdits(EditLogInputStream edits, long expectedStartingTxId)
+ throws IOException {
long startTime = now();
- int numEdits = loadFSEdits(edits, true);
+ int numEdits = loadFSEdits(edits, true, expectedStartingTxId);
FSImage.LOG.info("Edits file " + edits.getName()
+ " of size " + edits.length() + " edits # " + numEdits
+ " loaded in " + (now()-startTime)/1000 + " seconds.");
return numEdits;
}
- /**
- * Read the header of fsedit log
- * @param in fsedit stream
- * @return the edit log version number
- * @throws IOException if error occurs
- */
- int readLogVersion(DataInputStream in) throws IOException {
- int logVersion = 0;
- // Read log file version. Could be missing.
- in.mark(4);
- // If edits log is greater than 2G, available method will return negative
- // numbers, so we avoid having to call available
- boolean available = true;
- try {
- logVersion = in.readByte();
- } catch (EOFException e) {
- available = false;
- }
- if (available) {
- in.reset();
- logVersion = in.readInt();
- if (logVersion < FSConstants.LAYOUT_VERSION) // future version
- throw new IOException(
- "Unexpected version of the file system log file: "
- + logVersion + ". Current version = "
- + FSConstants.LAYOUT_VERSION + ".");
- }
- assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION :
- "Unsupported version " + logVersion;
- return logVersion;
- }
-
- int loadFSEdits(EditLogInputStream edits, boolean closeOnExit) throws IOException {
+ int loadFSEdits(EditLogInputStream edits, boolean closeOnExit,
+ long expectedStartingTxId)
+ throws IOException {
BufferedInputStream bin = new BufferedInputStream(edits);
DataInputStream in = new DataInputStream(bin);
int numEdits = 0;
- int logVersion = 0;
try {
- logVersion = readLogVersion(in);
- Checksum checksum = null;
- if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) {
- checksum = FSEditLog.getChecksum();
- in = new DataInputStream(new CheckedInputStream(bin, checksum));
- }
-
- numEdits = loadEditRecords(logVersion, in, checksum, false);
+ LogHeader header = LogHeader.read(in);
+ numEdits = loadEditRecords(
+ header.logVersion, in, header.checksum, false,
+ expectedStartingTxId);
} finally {
if(closeOnExit)
in.close();
}
- if (logVersion != FSConstants.LAYOUT_VERSION) // other version
- numEdits++; // save this image asap
+
return numEdits;
}
@SuppressWarnings("deprecation")
int loadEditRecords(int logVersion, DataInputStream in,
- Checksum checksum, boolean closeOnExit) throws IOException {
+ Checksum checksum, boolean closeOnExit,
+ long expectedStartingTxId)
+ throws IOException {
FSDirectory fsDir = fsNamesys.dir;
int numEdits = 0;
@@ -162,6 +131,8 @@ int loadEditRecords(int logVersion, DataInputStream in,
Arrays.fill(recentOpcodeOffsets, -1);
try {
+ long txId = expectedStartingTxId - 1;
+
try {
FSEditLogOp.Reader reader = new FSEditLogOp.Reader(in, logVersion,
checksum);
@@ -169,6 +140,15 @@ int loadEditRecords(int logVersion, DataInputStream in,
while ((op = reader.readOp()) != null) {
recentOpcodeOffsets[numEdits % recentOpcodeOffsets.length] =
tracker.getPos();
+ if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) {
+ long thisTxId = op.txid;
+ if (thisTxId != txId + 1) {
+ throw new IOException("Expected transaction ID " +
+ (txId + 1) + " but got " + thisTxId);
+ }
+ txId = thisTxId;
+ }
+
numEdits++;
switch (op.opCode) {
case OP_ADD:
@@ -417,6 +397,12 @@ int loadEditRecords(int logVersion, DataInputStream in,
reassignLeaseOp.path, reassignLeaseOp.newHolder, pendingFile);
break;
}
+ case OP_START_LOG_SEGMENT:
+ case OP_END_LOG_SEGMENT: {
+ // no data in here currently.
+ numOpOther++;
+ break;
+ }
case OP_DATANODE_ADD:
case OP_DATANODE_REMOVE:
numOpOther++;
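The STORED_TXIDS check added above enforces that replayed transaction IDs are strictly consecutive, starting from expectedStartingTxId. A self-contained sketch of the same invariant over a plain array of txids (class and method names here are illustrative):

    import java.io.IOException;

    class TxIdContiguity {
      // Mirrors the per-op check in loadEditRecords: each txid must be
      // exactly one greater than the previous one.
      static void verify(long expectedStartingTxId, long[] txids)
          throws IOException {
        long txId = expectedStartingTxId - 1;
        for (long thisTxId : txids) {
          if (thisTxId != txId + 1) {
            throw new IOException("Expected transaction ID " + (txId + 1) +
                " but got " + thisTxId);
          }
          txId = thisTxId;
        }
      }

      public static void main(String[] args) throws IOException {
        verify(101, new long[] {101, 102, 103});  // passes
        verify(101, new long[] {101, 103});       // throws: 102 is missing
      }
    }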
@@ -494,6 +480,61 @@ private void check203UpgradeFailure(int logVersion, IOException ex)
}
}
+ /**
+ * Return the valid length of, and the number of valid transactions in,
+ * the given edit log file. If the file is truncated partway through the
+ * header, returns a validation indicating 0 valid transactions.
+ * @throws IOException if the file cannot be read due to an I/O error
+ * (e.g. if the log does not exist)
+ */
+ static EditLogValidation validateEditLog(File f) throws IOException {
+ FileInputStream fis = new FileInputStream(f);
+ try {
+ PositionTrackingInputStream tracker = new PositionTrackingInputStream(
+ new BufferedInputStream(fis));
+ DataInputStream dis = new DataInputStream(tracker);
+ LogHeader header;
+ try {
+ header = LogHeader.read(dis);
+ } catch (Throwable t) {
+ FSImage.LOG.debug("Unable to read header from " + f +
+ " -> no valid transactions in this file.");
+ return new EditLogValidation(0, 0);
+ }
+
+ Reader reader = new FSEditLogOp.Reader(dis, header.logVersion, header.checksum);
+ long numValid = 0;
+ long lastPos = 0;
+ try {
+ while (true) {
+ lastPos = tracker.getPos();
+ if (reader.readOp() == null) {
+ break;
+ }
+ numValid++;
+ }
+ } catch (Throwable t) {
+ // Catch Throwable and not just IOE, since bad edits may generate
+ // NumberFormatExceptions, AssertionErrors, OutOfMemoryErrors, etc.
+ FSImage.LOG.debug("Caught exception after reading " + numValid +
+ " ops from " + f + " while determining its valid length.", t);
+ }
+ return new EditLogValidation(lastPos, numValid);
+ } finally {
+ fis.close();
+ }
+ }
+
+ static class EditLogValidation {
+ long validLength;
+ long numTransactions;
+
+ EditLogValidation(long validLength, long numTransactions) {
+ this.validLength = validLength;
+ this.numTransactions = numTransactions;
+ }
+ }
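validateEditLog pairs the offset after the last cleanly-read op with the count of parseable ops, which is what lets recovery truncate a torn in-progress log back to its valid length. A self-contained sketch of that truncate-to-valid-length pattern; the length-prefixed record format below is invented for illustration and is not the real edits format:

    import java.io.*;

    class TruncateToValidLength {
      // Scans length-prefixed records and returns the offset after the
      // last complete one; anything past it is a torn tail from a crash.
      static long validLength(File f) throws IOException {
        long lastGoodPos = 0;
        try (DataInputStream in = new DataInputStream(
            new BufferedInputStream(new FileInputStream(f)))) {
          while (true) {
            int len = in.readInt();        // EOFException here: clean end of log
            in.readFully(new byte[len]);   // EOFException here: torn record
            lastGoodPos += 4 + len;
          }
        } catch (EOFException e) {
          // Stop scanning; lastGoodPos already excludes the partial record.
        }
        return lastGoodPos;
      }

      static void truncateToValid(File f) throws IOException {
        try (RandomAccessFile raf = new RandomAccessFile(f, "rw")) {
          raf.setLength(validLength(f));
        }
      }
    }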
+
/**
* Stream wrapper that keeps track of the current file position.
*/
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
index fb6e65f23c2..00ce353b9ac 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOp.java
@@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
+import java.util.zip.CheckedInputStream;
import java.util.zip.Checksum;
import java.util.EnumMap;
@@ -29,10 +30,12 @@
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.hdfs.protocol.LayoutVersion;
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
-import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
+import org.apache.hadoop.hdfs.server.common.GenerationStamp;
+import org.apache.hadoop.hdfs.server.common.Storage;
import static org.apache.hadoop.hdfs.server.namenode.FSEditLogOpCodes.*;
import org.apache.hadoop.security.token.delegation.DelegationKey;
@@ -61,6 +64,8 @@
@InterfaceStability.Unstable
public abstract class FSEditLogOp {
final FSEditLogOpCodes opCode;
+ long txid;
+
@SuppressWarnings("deprecation")
private static ThreadLocal<EnumMap<FSEditLogOpCodes, FSEditLogOp>> opInstances =
@@ -93,8 +98,10 @@ protected EnumMap<FSEditLogOpCodes, FSEditLogOp> initialValue() {
instances.put(OP_CANCEL_DELEGATION_TOKEN,
new CancelDelegationTokenOp());
instances.put(OP_UPDATE_MASTER_KEY, new UpdateMasterKeyOp());
- instances.put(OP_CHECKPOINT_TIME, new CheckpointTimeOp());
- instances.put(OP_JSPOOL_START, new JSpoolStartOp());
+ instances.put(OP_START_LOG_SEGMENT,
+ new LogSegmentOp(OP_START_LOG_SEGMENT));
+ instances.put(OP_END_LOG_SEGMENT,
+ new LogSegmentOp(OP_END_LOG_SEGMENT));
return instances;
}
};
@@ -105,6 +112,11 @@ protected EnumMap<FSEditLogOpCodes, FSEditLogOp> initialValue() {
*/
private FSEditLogOp(FSEditLogOpCodes opCode) {
this.opCode = opCode;
+ this.txid = 0;
+ }
+
+ public void setTransactionId(long txid) {
+ this.txid = txid;
}
abstract void readFields(DataInputStream in, int logVersion)
@@ -1207,6 +1219,28 @@ void readFields(DataInputStream in, int logVersion)
this.key.readFields(in);
}
}
+
+ static class LogSegmentOp extends FSEditLogOp {
+ private LogSegmentOp(FSEditLogOpCodes code) {
+ super(code);
+ assert code == OP_START_LOG_SEGMENT ||
+ code == OP_END_LOG_SEGMENT : "Bad op: " + code;
+ }
+
+ static LogSegmentOp getInstance(FSEditLogOpCodes code) {
+ return (LogSegmentOp)opInstances.get().get(code);
+ }
+
+ public void readFields(DataInputStream in, int logVersion)
+ throws IOException {
+ // no data stored in these ops yet
+ }
+
+ @Override
+ void writeFields(DataOutputStream out) throws IOException {
+ // no data stored
+ }
+ }
static class InvalidOp extends FSEditLogOp {
private InvalidOp() {
@@ -1228,54 +1262,6 @@ void readFields(DataInputStream in, int logVersion)
}
}
- static class JSpoolStartOp extends FSEditLogOp {
- private JSpoolStartOp() {
- super(OP_JSPOOL_START);
- }
-
- static JSpoolStartOp getInstance() {
- return (JSpoolStartOp)opInstances.get().get(OP_JSPOOL_START);
- }
-
- @Override
- void writeFields(DataOutputStream out) throws IOException {
- }
-
- @Override
- void readFields(DataInputStream in, int logVersion)
- throws IOException {
- }
- }
-
- static class CheckpointTimeOp extends FSEditLogOp {
- long checkpointTime;
-
- private CheckpointTimeOp() {
- super(OP_CHECKPOINT_TIME);
- }
-
- CheckpointTimeOp setCheckpointTime(long time) {
- this.checkpointTime = time;
- return this;
- }
-
- static CheckpointTimeOp getInstance() {
- return (CheckpointTimeOp)opInstances.get()
- .get(OP_CHECKPOINT_TIME);
- }
-
- @Override
- void writeFields(DataOutputStream out) throws IOException {
- new LongWritable(checkpointTime).write(out);
- }
-
- @Override
- void readFields(DataInputStream in, int logVersion)
- throws IOException {
- this.checkpointTime = readLong(in);
- }
- }
-
static private short readShort(DataInputStream in) throws IOException {
return Short.parseShort(FSImageSerialization.readString(in));
}
@@ -1337,6 +1323,62 @@ private static long readLongWritable(DataInputStream in) throws IOException {
return longWritable.get();
}
}
+
+ /**
+ * Class to encapsulate the header at the top of a log file.
+ */
+ static class LogHeader {
+ final int logVersion;
+ final Checksum checksum;
+
+ public LogHeader(int logVersion, Checksum checksum) {
+ this.logVersion = logVersion;
+ this.checksum = checksum;
+ }
+
+ static LogHeader read(DataInputStream in) throws IOException {
+ int logVersion = 0;
+
+ logVersion = FSEditLogOp.LogHeader.readLogVersion(in);
+ Checksum checksum = null;
+ if (LayoutVersion.supports(Feature.EDITS_CHESKUM, logVersion)) {
+ checksum = FSEditLog.getChecksum();
+ }
+ return new LogHeader(logVersion, checksum);
+ }
+
+ /**
+ * Read the header of fsedit log
+ * @param in fsedit stream
+ * @return the edit log version number
+ * @throws IOException if error occurs
+ */
+ private static int readLogVersion(DataInputStream in) throws IOException {
+ int logVersion = 0;
+ // Read log file version. Could be missing.
+ in.mark(4);
+ // If the edits log is larger than 2GB, the available() method can
+ // return negative numbers, so we avoid calling it here
+ boolean available = true;
+ try {
+ logVersion = in.readByte();
+ } catch (EOFException e) {
+ available = false;
+ }
+ if (available) {
+ in.reset();
+ logVersion = in.readInt();
+ if (logVersion < FSConstants.LAYOUT_VERSION) // future version
+ throw new IOException(
+ "Unexpected version of the file system log file: "
+ + logVersion + ". Current version = "
+ + FSConstants.LAYOUT_VERSION + ".");
+ }
+ assert logVersion <= Storage.LAST_UPGRADABLE_LAYOUT_VERSION :
+ "Unsupported version " + logVersion;
+ return logVersion;
+ }
+ }
/**
* Class for writing editlog ops
@@ -1357,6 +1399,7 @@ public Writer(DataOutputBuffer out) {
public void writeOp(FSEditLogOp op) throws IOException {
int start = buf.getLength();
buf.writeByte(op.opCode.getOpCode());
+ buf.writeLong(op.txid);
op.writeFields(buf);
int end = buf.getLength();
Checksum checksum = FSEditLog.getChecksum();
@@ -1384,7 +1427,12 @@ public static class Reader {
@SuppressWarnings("deprecation")
public Reader(DataInputStream in, int logVersion,
Checksum checksum) {
- this.in = in;
+ if (checksum != null) {
+ this.in = new DataInputStream(
+ new CheckedInputStream(in, checksum));
+ } else {
+ this.in = in;
+ }
this.logVersion = logVersion;
this.checksum = checksum;
}
@@ -1423,9 +1471,15 @@ public FSEditLogOp readOp() throws IOException {
if (op == null) {
throw new IOException("Read invalid opcode " + opCode);
}
+
+ if (LayoutVersion.supports(Feature.STORED_TXIDS, logVersion)) {
+ // Read the txid
+ op.setTransactionId(in.readLong());
+ }
+
op.readFields(in, logVersion);
- validateChecksum(in, checksum);
+ validateChecksum(in, checksum, op.txid);
return op;
}
@@ -1433,7 +1487,8 @@ public FSEditLogOp readOp() throws IOException {
* Validate a transaction's checksum
*/
private void validateChecksum(DataInputStream in,
- Checksum checksum)
+ Checksum checksum,
+ long txid)
throws IOException {
if (checksum != null) {
int calculatedChecksum = (int)checksum.getValue();
@@ -1441,7 +1496,7 @@ private void validateChecksum(DataInputStream in,
if (readChecksum != calculatedChecksum) {
throw new ChecksumException(
"Transaction is corrupt. Calculated checksum is " +
- calculatedChecksum + " but read checksum " + readChecksum, -1);
+ calculatedChecksum + " but read checksum " + readChecksum, txid);
}
}
}
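Between them, Writer.writeOp and Reader.readOp fix a per-record framing: a one-byte opcode, an 8-byte txid (on layouts with STORED_TXIDS), the op's fields, then a checksum over the record. A self-contained sketch of that framing, using java.util.zip.CRC32 purely as a stand-in for whatever checksum FSEditLog.getChecksum() actually supplies:

    import java.io.*;
    import java.util.zip.CRC32;

    class OpFraming {
      static byte[] writeOp(byte opCode, long txid, byte[] fields)
          throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bos);
        out.writeByte(opCode);                 // 1-byte opcode
        out.writeLong(txid);                   // 8-byte transaction id
        out.write(fields);                     // op-specific fields
        CRC32 crc = new CRC32();
        crc.update(bos.toByteArray());         // checksum covers the record body
        out.writeInt((int) crc.getValue());    // 4-byte checksum trailer
        return bos.toByteArray();
      }

      static void readOp(byte[] record) throws IOException {
        DataInputStream in =
            new DataInputStream(new ByteArrayInputStream(record));
        byte opCode = in.readByte();
        long txid = in.readLong();
        in.readFully(new byte[record.length - 1 - 8 - 4]);   // skip fields
        CRC32 crc = new CRC32();
        crc.update(record, 0, record.length - 4);  // everything but the trailer
        if (in.readInt() != (int) crc.getValue()) {
          throw new IOException("Transaction " + txid + " is corrupt");
        }
        System.out.println("op=" + opCode + " txid=" + txid);
      }

      public static void main(String[] args) throws IOException {
        // A payload-less record, roughly what a LogSegmentOp would produce.
        readOp(writeOp((byte) 24, 1L, new byte[0]));
      }
    }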
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java
index 3c59b67fe62..220c267f085 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogOpCodes.java
@@ -54,10 +54,8 @@ public enum FSEditLogOpCodes {
OP_CANCEL_DELEGATION_TOKEN ((byte) 20),
OP_UPDATE_MASTER_KEY ((byte) 21),
OP_REASSIGN_LEASE ((byte) 22),
- // must be same as NamenodeProtocol.JA_JSPOOL_START
- OP_JSPOOL_START ((byte)102),
- // must be same as NamenodeProtocol.JA_CHECKPOINT_TIME
- OP_CHECKPOINT_TIME ((byte)103);
+ OP_END_LOG_SEGMENT ((byte) 23),
+ OP_START_LOG_SEGMENT ((byte) 24);
private byte opCode;
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
index afa347c44f4..993dd8cd44e 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java
@@ -21,17 +21,13 @@
import java.io.File;
import java.io.IOException;
import java.net.URI;
-import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
-import java.util.Date;
import java.util.HashMap;
-import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -46,42 +42,45 @@
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
-import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.common.Util;
import static org.apache.hadoop.hdfs.server.common.Util.now;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
+import org.apache.hadoop.hdfs.server.namenode.FSImageStorageInspector.LoadPlan;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NNStorageListener;
import org.apache.hadoop.hdfs.server.protocol.CheckpointCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
+import org.apache.hadoop.hdfs.util.MD5FileUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.hdfs.DFSConfigKeys;
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.common.collect.Lists;
+
/**
* FSImage handles checkpointing and logging of the namespace edits.
*
*/
@InterfaceAudience.Private
@InterfaceStability.Evolving
-public class FSImage implements NNStorageListener, Closeable {
+public class FSImage implements Closeable {
protected static final Log LOG = LogFactory.getLog(FSImage.class.getName());
- private static final SimpleDateFormat DATE_FORM =
- new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
-
- // checkpoint states
- enum CheckpointStates{START, ROLLED_EDITS, UPLOAD_START, UPLOAD_DONE; }
-
protected FSNamesystem namesystem = null;
protected FSEditLog editLog = null;
private boolean isUpgradeFinalized = false;
- protected MD5Hash newImageDigest = null;
- protected NNStorage storage = null;
+ protected NNStorage storage;
+
+ /**
+ * The last transaction ID that was either loaded from an image
+ * or applied by replaying edits files.
+ */
+ protected long lastAppliedTxId = 0;
/**
* URIs for importing an image from a checkpoint. In the default case,
@@ -90,75 +89,69 @@ enum CheckpointStates{START, ROLLED_EDITS, UPLOAD_START, UPLOAD_DONE; }
private Collection<URI> checkpointDirs;
private Collection<URI> checkpointEditsDirs;
- private Configuration conf;
+ final private Configuration conf;
+
+ private final NNStorageRetentionManager archivalManager;
/**
- * Can fs-image be rolled?
- */
- volatile protected CheckpointStates ckptState = FSImage.CheckpointStates.START;
-
- /**
- */
- FSImage() {
- this((FSNamesystem)null);
- }
-
- /**
- * Constructor
+ * Construct an FSImage.
* @param conf Configuration
+ * @see #FSImage(Configuration conf, FSNamesystem ns,
+ * Collection<URI> imageDirs, Collection<URI> editsDirs)
+ * @throws IOException if default directories are invalid.
*/
- FSImage(Configuration conf) throws IOException {
- this();
- this.conf = conf; // TODO we have too many constructors, this is a mess
-
- if(conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
- DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT)) {
- NameNode.LOG.info("set FSImage.restoreFailedStorage");
- storage.setRestoreFailedStorage(true);
- }
- setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null),
- FSImage.getCheckpointEditsDirs(conf, null));
+ public FSImage(Configuration conf) throws IOException {
+ this(conf, (FSNamesystem)null);
}
- private FSImage(FSNamesystem ns) {
- this.conf = new Configuration();
-
- storage = new NNStorage(conf);
+ /**
+ * Construct an FSImage
+ * @param conf Configuration
+ * @param ns The FSNamesystem using this image.
+ * @see #FSImage(Configuration conf, FSNamesystem ns,
+ * Collection<URI> imageDirs, Collection<URI> editsDirs)
+ * @throws IOException if default directories are invalid.
+ */
+ private FSImage(Configuration conf, FSNamesystem ns) throws IOException {
+ this(conf, ns,
+ FSNamesystem.getNamespaceDirs(conf),
+ FSNamesystem.getNamespaceEditsDirs(conf));
+ }
+
+ /**
+ * Construct the FSImage. Set the default checkpoint directories.
+ *
+ * Setup storage and initialize the edit log.
+ *
+ * @param conf Configuration
+ * @param ns The FSNamesystem using this image.
+ * @param imageDirs Directories the image can be stored in.
+ * @param editsDirs Directories the editlog can be stored in.
+ * @throws IOException if directories are invalid.
+ */
+ protected FSImage(Configuration conf, FSNamesystem ns,
+ Collection<URI> imageDirs, Collection<URI> editsDirs)
+ throws IOException {
+ this.conf = conf;
+ setCheckpointDirectories(FSImage.getCheckpointDirs(conf, null),
+ FSImage.getCheckpointEditsDirs(conf, null));
+
+ storage = new NNStorage(conf, imageDirs, editsDirs);
if (ns != null) {
storage.setUpgradeManager(ns.upgradeManager);
}
- storage.registerListener(this);
+
+ if(conf.getBoolean(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_KEY,
+ DFSConfigKeys.DFS_NAMENODE_NAME_DIR_RESTORE_DEFAULT)) {
+ storage.setRestoreFailedStorage(true);
+ }
this.editLog = new FSEditLog(storage);
setFSNamesystem(ns);
+
+ archivalManager = new NNStorageRetentionManager(conf, storage, editLog);
}
- /**
- * @throws IOException
- */
- FSImage(Collection<URI> fsDirs, Collection<URI> fsEditsDirs)
- throws IOException {
- this();
- storage.setStorageDirectories(fsDirs, fsEditsDirs);
- }
-
- public FSImage(StorageInfo storageInfo, String bpid) {
- storage = new NNStorage(storageInfo, bpid);
- }
-
- /**
- * Represents an Image (image and edit file).
- * @throws IOException
- */
- FSImage(URI imageDir) throws IOException {
- this();
- ArrayList<URI> dirs = new ArrayList<URI>(1);
- ArrayList<URI> editsDirs = new ArrayList<URI>(1);
- dirs.add(imageDir);
- editsDirs.add(imageDir);
- storage.setStorageDirectories(dirs, editsDirs);
- }
-
protected FSNamesystem getFSNamesystem() {
return namesystem;
}
@@ -169,33 +162,37 @@ void setFSNamesystem(FSNamesystem ns) {
storage.setUpgradeManager(ns.upgradeManager);
}
}
-
+
void setCheckpointDirectories(Collection<URI> dirs,
Collection<URI> editsDirs) {
checkpointDirs = dirs;
checkpointEditsDirs = editsDirs;
}
+ void format(String clusterId) throws IOException {
+ storage.format(clusterId);
+ saveFSImageInAllDirs(0);
+ }
+
/**
* Analyze storage directories.
* Recover from previous transitions if required.
* Perform fs state transition if necessary depending on the namespace info.
* Read storage info.
*
- * @param dataDirs
- * @param startOpt startup option
* @throws IOException
* @return true if the image needs to be saved or false otherwise
*/
- boolean recoverTransitionRead(Collection<URI> dataDirs,
- Collection<URI> editsDirs,
- StartupOption startOpt)
+ boolean recoverTransitionRead(StartupOption startOpt)
throws IOException {
assert startOpt != StartupOption.FORMAT :
"NameNode formatting should be performed before reading the image";
+ Collection<URI> imageDirs = storage.getImageDirectories();
+ Collection<URI> editsDirs = storage.getEditsDirectories();
+
// none of the data dirs exist
- if((dataDirs.size() == 0 || editsDirs.size() == 0)
+ if((imageDirs.size() == 0 || editsDirs.size() == 0)
&& startOpt != StartupOption.IMPORT)
throw new IOException(
"All specified directories are not accessible or do not exist.");
@@ -210,50 +207,24 @@ boolean recoverTransitionRead(Collection dataDirs,
throw new IOException("Cannot import image from a checkpoint. "
+ "\"dfs.namenode.checkpoint.dir\" is not set." );
- storage.setStorageDirectories(dataDirs, editsDirs);
// 1. For each data directory calculate its state and
// check whether all is consistent before transitioning.
Map<StorageDirectory, StorageState> dataDirStates =
new HashMap<StorageDirectory, StorageState>();
- boolean isFormatted = false;
- for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- StorageState curState;
- try {
- curState = sd.analyzeStorage(startOpt, storage);
- // sd is locked but not opened
- switch(curState) {
- case NON_EXISTENT:
- // name-node fails if any of the configured storage dirs are missing
- throw new InconsistentFSStateException(sd.getRoot(),
- "storage directory does not exist or is not accessible.");
- case NOT_FORMATTED:
- break;
- case NORMAL:
- break;
- default: // recovery is possible
- sd.doRecover(curState);
- }
- if (curState != StorageState.NOT_FORMATTED
- && startOpt != StartupOption.ROLLBACK) {
- // read and verify consistency with other directories
- storage.readProperties(sd);
- isFormatted = true;
- }
- if (startOpt == StartupOption.IMPORT && isFormatted)
- // import of a checkpoint is allowed only into empty image directories
- throw new IOException("Cannot import image from a checkpoint. "
- + " NameNode already contains an image in "+ sd.getRoot());
- } catch (IOException ioe) {
- sd.unlock();
- throw ioe;
- }
- dataDirStates.put(sd,curState);
+ boolean isFormatted = recoverStorageDirs(startOpt, dataDirStates);
+
+ if (LOG.isTraceEnabled()) {
+ LOG.trace("Data dir states:\n " +
+ Joiner.on("\n ").withKeyValueSeparator(": ")
+ .join(dataDirStates));
}
if (!isFormatted && startOpt != StartupOption.ROLLBACK
- && startOpt != StartupOption.IMPORT)
- throw new IOException("NameNode is not formatted.");
+ && startOpt != StartupOption.IMPORT) {
+ throw new IOException("NameNode is not formatted.");
+ }
+
+
int layoutVersion = storage.getLayoutVersion();
if (layoutVersion < Storage.LAST_PRE_UPGRADE_LAYOUT_VERSION) {
NNStorage.checkVersionUpgradable(storage.getLayoutVersion());
@@ -274,7 +245,6 @@ boolean recoverTransitionRead(Collection dataDirs,
storage.verifyDistributedUpgradeProgress(startOpt);
// 2. Format unformatted dirs.
- storage.setCheckpointTime(0L);
for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
StorageState curState = dataDirStates.get(sd);
@@ -307,13 +277,55 @@ boolean recoverTransitionRead(Collection dataDirs,
// just load the image
}
- boolean needToSave = loadFSImage();
-
- assert editLog != null : "editLog must be initialized";
- if(!editLog.isOpen())
- editLog.open();
-
- return needToSave;
+ return loadFSImage();
+ }
+
+ /**
+ * For each storage directory, performs recovery of incomplete transitions
+ * (e.g. upgrade, rollback, checkpoint) and inserts the directory's storage
+ * state into the dataDirStates map.
+ * @param dataDirStates output of storage directory states
+ * @return true if there is at least one valid formatted storage directory
+ */
+ private boolean recoverStorageDirs(StartupOption startOpt,
+ Map<StorageDirectory, StorageState> dataDirStates) throws IOException {
+ boolean isFormatted = false;
+ for (Iterator<StorageDirectory> it =
+ storage.dirIterator(); it.hasNext();) {
+ StorageDirectory sd = it.next();
+ StorageState curState;
+ try {
+ curState = sd.analyzeStorage(startOpt, storage);
+ // sd is locked but not opened
+ switch(curState) {
+ case NON_EXISTENT:
+ // name-node fails if any of the configured storage dirs are missing
+ throw new InconsistentFSStateException(sd.getRoot(),
+ "storage directory does not exist or is not accessible.");
+ case NOT_FORMATTED:
+ break;
+ case NORMAL:
+ break;
+ default: // recovery is possible
+ sd.doRecover(curState);
+ }
+ if (curState != StorageState.NOT_FORMATTED
+ && startOpt != StartupOption.ROLLBACK) {
+ // read and verify consistency with other directories
+ storage.readProperties(sd);
+ isFormatted = true;
+ }
+ if (startOpt == StartupOption.IMPORT && isFormatted)
+ // import of a checkpoint is allowed only into empty image directories
+ throw new IOException("Cannot import image from a checkpoint. "
+ + " NameNode already contains an image in " + sd.getRoot());
+ } catch (IOException ioe) {
+ sd.unlock();
+ throw ioe;
+ }
+ dataDirStates.put(sd,curState);
+ }
+ return isFormatted;
}
private void doUpgrade() throws IOException {
@@ -342,12 +354,9 @@ private void doUpgrade() throws IOException {
storage.cTime = now(); // generate new cTime for the state
int oldLV = storage.getLayoutVersion();
storage.layoutVersion = FSConstants.LAYOUT_VERSION;
- storage.setCheckpointTime(now());
List<StorageDirectory> errorSDs =
Collections.synchronizedList(new ArrayList<StorageDirectory>());
- List<Thread> saveThreads = new ArrayList<Thread>();
- File curDir, prevDir, tmpDir;
for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
LOG.info("Starting upgrade of image directory " + sd.getRoot()
@@ -356,9 +365,9 @@ private void doUpgrade() throws IOException {
+ ".\n new LV = " + storage.getLayoutVersion()
+ "; new CTime = " + storage.getCTime());
try {
- curDir = sd.getCurrentDir();
- prevDir = sd.getPreviousDir();
- tmpDir = sd.getPreviousTmp();
+ File curDir = sd.getCurrentDir();
+ File prevDir = sd.getPreviousDir();
+ File tmpDir = sd.getPreviousTmp();
assert curDir.exists() : "Current directory must exist.";
assert !prevDir.exists() : "prvious directory must not exist.";
assert !tmpDir.exists() : "prvious.tmp directory must not exist.";
@@ -367,27 +376,30 @@ private void doUpgrade() throws IOException {
// rename current to tmp
NNStorage.rename(curDir, tmpDir);
- // launch thread to save new image
- FSImageSaver saver = new FSImageSaver(sd, errorSDs);
- Thread saveThread = new Thread(saver, saver.toString());
- saveThreads.add(saveThread);
- saveThread.start();
-
+ if (!curDir.mkdir()) {
+ throw new IOException("Cannot create directory " + curDir);
+ }
} catch (Exception e) {
- LOG.error("Failed upgrade of image directory " + sd.getRoot(), e);
+ LOG.error("Failed to move aside pre-upgrade storage " +
+ "in image directory " + sd.getRoot(), e);
errorSDs.add(sd);
continue;
}
}
- waitForThreads(saveThreads);
- saveThreads.clear();
+ storage.reportErrorsOnDirectories(errorSDs);
+ errorSDs.clear();
+
+ saveFSImageInAllDirs(editLog.getLastWrittenTxId());
for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
- if (errorSDs.contains(sd)) continue;
try {
- prevDir = sd.getPreviousDir();
- tmpDir = sd.getPreviousTmp();
+ // Write the version file, since saveFSImage above only makes the
+ // fsimage_<txid> file, and the directory is otherwise empty.
+ storage.writeProperties(sd);
+
+ File prevDir = sd.getPreviousDir();
+ File tmpDir = sd.getPreviousTmp();
// rename tmp to previous
NNStorage.rename(tmpDir, prevDir);
} catch (IOException ioe) {
@@ -397,15 +409,16 @@ private void doUpgrade() throws IOException {
}
LOG.info("Upgrade of " + sd.getRoot() + " is complete.");
}
+ storage.reportErrorsOnDirectories(errorSDs);
+
isUpgradeFinalized = false;
- if (!errorSDs.isEmpty()) {
- storage.reportErrorsOnDirectories(errorSDs);
+ if (!storage.getRemovedStorageDirs().isEmpty()) {
//during upgrade, it's a fatal error to fail any storage directory
- throw new IOException("Upgrade failed in " + errorSDs.size()
+ throw new IOException("Upgrade failed in "
+ + storage.getRemovedStorageDirs().size()
+ " storage directory(ies), previously logged.");
}
storage.initializeDistributedUpgrade();
- editLog.open();
}
private void doRollback() throws IOException {
@@ -413,7 +426,7 @@ private void doRollback() throws IOException {
// a previous fs states in at least one of the storage directories.
// Directories that don't have previous state do not rollback
boolean canRollback = false;
- FSImage prevState = new FSImage(getFSNamesystem());
+ FSImage prevState = new FSImage(conf, getFSNamesystem());
prevState.getStorage().layoutVersion = FSConstants.LAYOUT_VERSION;
for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
@@ -499,25 +512,27 @@ private void doFinalize(StorageDirectory sd) throws IOException {
*/
void doImportCheckpoint() throws IOException {
FSNamesystem fsNamesys = getFSNamesystem();
- FSImage ckptImage = new FSImage(fsNamesys);
+ FSImage ckptImage = new FSImage(conf, fsNamesys,
+ checkpointDirs, checkpointEditsDirs);
// replace real image with the checkpoint image
FSImage realImage = fsNamesys.getFSImage();
assert realImage == this;
fsNamesys.dir.fsImage = ckptImage;
// load from the checkpoint dirs
try {
- ckptImage.recoverTransitionRead(checkpointDirs, checkpointEditsDirs,
- StartupOption.REGULAR);
+ ckptImage.recoverTransitionRead(StartupOption.REGULAR);
} finally {
ckptImage.close();
}
// return back the real image
realImage.getStorage().setStorageInfo(ckptImage.getStorage());
- storage.setCheckpointTime(ckptImage.getStorage().getCheckpointTime());
+ realImage.getEditLog().setNextTxId(ckptImage.getEditLog().getLastWrittenTxId()+1);
+
fsNamesys.dir.fsImage = realImage;
realImage.getStorage().setBlockPoolID(ckptImage.getBlockPoolID());
// and save it but keep the same checkpointTime
- saveNamespace(false);
+ saveNamespace();
+ getStorage().writeAll();
}
void finalizeUpgrade() throws IOException {
@@ -535,51 +550,23 @@ public FSEditLog getEditLog() {
return editLog;
}
- //
- // Atomic move sequence, to recover from interrupted checkpoint
- //
- boolean recoverInterruptedCheckpoint(StorageDirectory nameSD,
- StorageDirectory editsSD)
- throws IOException {
- boolean needToSave = false;
- File curFile = NNStorage.getStorageFile(nameSD, NameNodeFile.IMAGE);
- File ckptFile = NNStorage.getStorageFile(nameSD, NameNodeFile.IMAGE_NEW);
+ void openEditLog() throws IOException {
+ assert editLog != null : "editLog must be initialized";
+ Preconditions.checkState(!editLog.isOpen(),
+ "edit log should not yet be open");
+ editLog.open();
+ storage.writeTransactionIdFileToStorage(editLog.getCurSegmentTxId());
+ };
+
+ /**
+ * Toss the current image and namesystem, reloading from the specified
+ * file.
+ */
+ void reloadFromImageFile(File file) throws IOException {
+ namesystem.dir.reset();
- //
- // If we were in the midst of a checkpoint
- //
- if (ckptFile.exists()) {
- needToSave = true;
- if (NNStorage.getStorageFile(editsSD, NameNodeFile.EDITS_NEW).exists()) {
- //
- // checkpointing migth have uploaded a new
- // merged image, but we discard it here because we are
- // not sure whether the entire merged image was uploaded
- // before the namenode crashed.
- //
- if (!ckptFile.delete()) {
- throw new IOException("Unable to delete " + ckptFile);
- }
- } else {
- //
- // checkpointing was in progress when the namenode
- // shutdown. The fsimage.ckpt was created and the edits.new
- // file was moved to edits. We complete that checkpoint by
- // moving fsimage.new to fsimage. There is no need to
- // update the fstime file here. renameTo fails on Windows
- // if the destination file already exists.
- //
- if (!ckptFile.renameTo(curFile)) {
- if (!curFile.delete())
- LOG.warn("Unable to delete dir " + curFile + " before rename");
- if (!ckptFile.renameTo(curFile)) {
- throw new IOException("Unable to rename " + ckptFile +
- " to " + curFile);
- }
- }
- }
- }
- return needToSave;
+ LOG.debug("Reloading namespace from " + file);
+ loadFSImage(file);
}
/**
@@ -598,124 +585,131 @@ boolean recoverInterruptedCheckpoint(StorageDirectory nameSD,
* @throws IOException
*/
boolean loadFSImage() throws IOException {
- long latestNameCheckpointTime = Long.MIN_VALUE;
- long latestEditsCheckpointTime = Long.MIN_VALUE;
- boolean needToSave = false;
- isUpgradeFinalized = true;
+ FSImageStorageInspector inspector = storage.readAndInspectDirs();
- StorageDirectory latestNameSD = null;
- StorageDirectory latestEditsSD = null;
+ isUpgradeFinalized = inspector.isUpgradeFinalized();
- Collection<String> imageDirs = new ArrayList<String>();
- Collection<String> editsDirs = new ArrayList<String>();
+ boolean needToSave = inspector.needToSave();
- // Set to determine if all of storageDirectories share the same checkpoint
- Set<Long> checkpointTimes = new HashSet<Long>();
+ // Plan our load. This will throw if it's impossible to load from the
+ // data that's available.
+ LoadPlan loadPlan = inspector.createLoadPlan();
+ LOG.debug("Planning to load image using following plan:\n" + loadPlan);
- // Process each of the storage directories to find the pair of
- // newest image file and edit file
- for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
-
- // Was the file just formatted?
- if (!sd.getVersionFile().exists()) {
- needToSave |= true;
- continue;
- }
-
- boolean imageExists = false;
- boolean editsExists = false;
-
- // Determine if sd is image, edits or both
- if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) {
- imageExists = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE).exists();
- imageDirs.add(sd.getRoot().getCanonicalPath());
- }
-
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
- editsExists = NNStorage.getStorageFile(sd, NameNodeFile.EDITS).exists();
- editsDirs.add(sd.getRoot().getCanonicalPath());
- }
-
- long checkpointTime = storage.readCheckpointTime(sd);
-
- checkpointTimes.add(checkpointTime);
-
- if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE) &&
- (latestNameCheckpointTime < checkpointTime) && imageExists) {
- latestNameCheckpointTime = checkpointTime;
- latestNameSD = sd;
- }
-
- if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS) &&
- (latestEditsCheckpointTime < checkpointTime) && editsExists) {
- latestEditsCheckpointTime = checkpointTime;
- latestEditsSD = sd;
- }
-
- // check that we have a valid, non-default checkpointTime
- if (checkpointTime <= 0L)
- needToSave |= true;
-
- // set finalized flag
- isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists();
- }
-
- // We should have at least one image and one edits dirs
- if (latestNameSD == null)
- throw new IOException("Image file is not found in " + imageDirs);
- if (latestEditsSD == null)
- throw new IOException("Edits file is not found in " + editsDirs);
-
- // Make sure we are loading image and edits from same checkpoint
- if (latestNameCheckpointTime > latestEditsCheckpointTime
- && latestNameSD != latestEditsSD
- && latestNameSD.getStorageDirType() == NameNodeDirType.IMAGE
- && latestEditsSD.getStorageDirType() == NameNodeDirType.EDITS) {
- // This is a rare failure when NN has image-only and edits-only
- // storage directories, and fails right after saving images,
- // in some of the storage directories, but before purging edits.
- // See -NOTE- in saveNamespace().
- LOG.error("This is a rare failure scenario!!!");
- LOG.error("Image checkpoint time " + latestNameCheckpointTime +
- " > edits checkpoint time " + latestEditsCheckpointTime);
- LOG.error("Name-node will treat the image as the latest state of " +
- "the namespace. Old edits will be discarded.");
- } else if (latestNameCheckpointTime != latestEditsCheckpointTime)
- throw new IOException("Inconsistent storage detected, " +
- "image and edits checkpoint times do not match. " +
- "image checkpoint time = " + latestNameCheckpointTime +
- "edits checkpoint time = " + latestEditsCheckpointTime);
-
- // If there was more than one checkpointTime recorded we should save
- needToSave |= checkpointTimes.size() != 1;
// Recover from previous interrupted checkpoint, if any
- needToSave |= recoverInterruptedCheckpoint(latestNameSD, latestEditsSD);
+ needToSave |= loadPlan.doRecovery();
//
// Load in bits
//
- storage.readProperties(latestNameSD);
- needToSave |= loadFSImage(NNStorage.getStorageFile(latestNameSD,
- NameNodeFile.IMAGE));
+ StorageDirectory sdForProperties =
+ loadPlan.getStorageDirectoryForProperties();
+ storage.readProperties(sdForProperties);
+ File imageFile = loadPlan.getImageFile();
+
+ try {
+ if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT,
+ getLayoutVersion())) {
+ // For txid-based layout, we should have a .md5 file
+ // next to the image file
+ loadFSImage(imageFile);
+ } else if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM,
+ getLayoutVersion())) {
+ // In 0.22, we have the checksum stored in the VERSION file.
+ String md5 = storage.getDeprecatedProperty(
+ NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY);
+ if (md5 == null) {
+ throw new InconsistentFSStateException(sdForProperties.getRoot(),
+ "Message digest property " +
+ NNStorage.DEPRECATED_MESSAGE_DIGEST_PROPERTY +
+ " not set for storage directory " + sdForProperties.getRoot());
+ }
+ loadFSImage(imageFile, new MD5Hash(md5));
+ } else {
+ // We don't have any record of the md5sum
+ loadFSImage(imageFile, null);
+ }
+ } catch (IOException ioe) {
+ throw new IOException("Failed to load image from " + loadPlan.getImageFile(), ioe);
+ }
- // Load latest edits
- if (latestNameCheckpointTime > latestEditsCheckpointTime)
- // the image is already current, discard edits
- needToSave |= true;
- else // latestNameCheckpointTime == latestEditsCheckpointTime
- needToSave |= (loadFSEdits(latestEditsSD) > 0);
+ long numLoaded = loadEdits(loadPlan.getEditsFiles());
+ needToSave |= needsResaveBasedOnStaleCheckpoint(imageFile, numLoaded);
+ // update the txid for the edit log
+ editLog.setNextTxId(storage.getMostRecentCheckpointTxId() + numLoaded + 1);
return needToSave;
}
+
+ /**
+ * @param imageFile the image file that was loaded
+ * @param numEditsLoaded the number of edits loaded from edits logs
+ * @return true if the NameNode should automatically save the namespace
+ * when it is started, due to the latest checkpoint being too old.
+ */
+ private boolean needsResaveBasedOnStaleCheckpoint(
+ File imageFile, long numEditsLoaded) {
+ final long checkpointPeriod = conf.getLong(
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
+ final long checkpointTxnCount = conf.getLong(
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
+ DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
+ long checkpointAge = System.currentTimeMillis() - imageFile.lastModified();
+
+ return (checkpointAge > checkpointPeriod * 1000) ||
+ (numEditsLoaded > checkpointTxnCount);
+ }
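The method above is two independent staleness tests: image age against DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, and edits replayed at startup against DFS_NAMENODE_CHECKPOINT_TXNS_KEY. A worked example with purely illustrative threshold values:

    class StaleCheckpointExample {
      public static void main(String[] args) {
        long checkpointPeriodSecs = 3600;          // hypothetical period threshold
        long checkpointTxnCount = 100000;          // hypothetical txn threshold
        long checkpointAgeMs = 5L * 3600 * 1000;   // image written 5 hours ago
        long numEditsLoaded = 1200;                // edits replayed at startup

        boolean needsResave =
            (checkpointAgeMs > checkpointPeriodSecs * 1000) ||
            (numEditsLoaded > checkpointTxnCount);
        // 5h > 1h, so the namespace would be re-saved at startup.
        System.out.println("needs resave: " + needsResave);
      }
    }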
+
+ /**
+ * Load the specified list of edit files into the image.
+ * @return the number of transactions loaded
+ */
+ protected long loadEdits(List<File> editLogs) throws IOException {
+ LOG.debug("About to load edits:\n " + Joiner.on("\n ").join(editLogs));
+
+ long startingTxId = getLastAppliedTxId() + 1;
+
+ FSEditLogLoader loader = new FSEditLogLoader(namesystem);
+ int numLoaded = 0;
+ // Load latest edits
+ for (File edits : editLogs) {
+ LOG.debug("Reading " + edits + " expecting start txid #" + startingTxId);
+ EditLogFileInputStream editIn = new EditLogFileInputStream(edits);
+ int thisNumLoaded = loader.loadFSEdits(editIn, startingTxId);
+ startingTxId += thisNumLoaded;
+ numLoaded += thisNumLoaded;
+ lastAppliedTxId += thisNumLoaded;
+ editIn.close();
+ }
+
+ // update the counts
+ getFSNamesystem().dir.updateCountForINodeWithQuota();
+ return numLoaded;
+ }
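loadEdits threads a single running txid through every segment: each file must begin where the previous one left off, and lastAppliedTxId advances by the number of edits each contributed. A self-contained sketch of that bookkeeping over hypothetical segment sizes:

    class SegmentReplay {
      public static void main(String[] args) {
        long lastAppliedTxId = 100;      // e.g. the txid of the loaded image
        int[] segmentSizes = {50, 25};   // hypothetical op counts per edits file
        long startingTxId = lastAppliedTxId + 1;
        long numLoaded = 0;
        for (int size : segmentSizes) {
          System.out.println("reading segment, expecting start txid #" +
              startingTxId);
          startingTxId += size;
          numLoaded += size;
          lastAppliedTxId += size;
        }
        // numLoaded = 75, lastAppliedTxId = 175
        System.out.println("loaded " + numLoaded +
            " edits; last applied txid = " + lastAppliedTxId);
      }
    }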
+
+
+ /**
+ * Load the image namespace from the given image file, verifying
+ * it against the MD5 sum stored in its associated .md5 file.
+ */
+ private void loadFSImage(File imageFile) throws IOException {
+ MD5Hash expectedMD5 = MD5FileUtils.readStoredMd5ForFile(imageFile);
+ if (expectedMD5 == null) {
+ throw new IOException("No MD5 file found corresponding to image file "
+ + imageFile);
+ }
+ loadFSImage(imageFile, expectedMD5);
+ }
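With the txid-based layout, the image digest lives in a .md5 side-car next to the image rather than in the VERSION file. A self-contained sketch of the verification step, assuming (for illustration) that the side-car's first token is the hex digest; MD5FileUtils' actual file format may differ:

    import java.io.*;
    import java.nio.file.Files;
    import java.security.MessageDigest;

    class ImageMd5Check {
      static String md5Hex(File f) throws Exception {
        MessageDigest md = MessageDigest.getInstance("MD5");
        try (InputStream in =
            new BufferedInputStream(new FileInputStream(f))) {
          byte[] buf = new byte[8192];
          for (int n; (n = in.read(buf)) != -1; ) {
            md.update(buf, 0, n);
          }
        }
        StringBuilder sb = new StringBuilder();
        for (byte b : md.digest()) {
          sb.append(String.format("%02x", b));
        }
        return sb.toString();
      }

      static void verify(File imageFile) throws Exception {
        File md5File = new File(imageFile.getPath() + ".md5");
        String expected =
            Files.readAllLines(md5File.toPath()).get(0).trim().split("\\s+")[0];
        String actual = md5Hex(imageFile);
        if (!expected.equalsIgnoreCase(actual)) {
          throw new IOException("Image file " + imageFile +
              " is corrupt with MD5 checksum of " + actual +
              " but expecting " + expected);
        }
      }
    }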
+
/**
* Load in the filesystem image from file. It's a big list of
* filenames and blocks. Return whether we should
* "re-save" and consolidate the edit-logs
*/
- boolean loadFSImage(File curFile) throws IOException {
+ private void loadFSImage(File curFile, MD5Hash expectedMd5) throws IOException {
FSImageFormat.Loader loader = new FSImageFormat.Loader(
conf, getFSNamesystem());
loader.load(curFile);
@@ -724,63 +718,32 @@ boolean loadFSImage(File curFile) throws IOException {
// Check that the image digest we loaded matches up with what
// we expected
MD5Hash readImageMd5 = loader.getLoadedImageMd5();
- if (storage.getImageDigest() == null) {
- storage.setImageDigest(readImageMd5); // set this fsimage's checksum
- } else if (!storage.getImageDigest().equals(readImageMd5)) {
+ if (expectedMd5 != null &&
+ !expectedMd5.equals(readImageMd5)) {
throw new IOException("Image file " + curFile +
" is corrupt with MD5 checksum of " + readImageMd5 +
- " but expecting " + storage.getImageDigest());
+ " but expecting " + expectedMd5);
}
- storage.namespaceID = loader.getLoadedNamespaceID();
- storage.layoutVersion = loader.getLoadedImageVersion();
-
- boolean needToSave =
- loader.getLoadedImageVersion() != FSConstants.LAYOUT_VERSION;
- return needToSave;
- }
-
- /**
- * Load and merge edits from two edits files
- *
- * @param sd storage directory
- * @return number of edits loaded
- * @throws IOException
- */
- int loadFSEdits(StorageDirectory sd) throws IOException {
- FSEditLogLoader loader = new FSEditLogLoader(namesystem);
-
- int numEdits = 0;
- EditLogFileInputStream edits =
- new EditLogFileInputStream(NNStorage.getStorageFile(sd,
- NameNodeFile.EDITS));
-
- numEdits = loader.loadFSEdits(edits);
- edits.close();
- File editsNew = NNStorage.getStorageFile(sd, NameNodeFile.EDITS_NEW);
-
- if (editsNew.exists() && editsNew.length() > 0) {
- edits = new EditLogFileInputStream(editsNew);
- numEdits += loader.loadFSEdits(edits);
- edits.close();
- }
- if (numEdits == 0 && editsNew.exists()) {
- numEdits++;
- }
- // update the counts.
- getFSNamesystem().dir.updateCountForINodeWithQuota();
-
- return numEdits;
+ long txId = loader.getLoadedImageTxId();
+ LOG.info("Loaded image for txid " + txId + " from " + curFile);
+ lastAppliedTxId = txId;
+ storage.setMostRecentCheckpointTxId(txId);
}
/**
* Save the contents of the FS image to the file.
*/
- void saveFSImage(File newFile) throws IOException {
+ void saveFSImage(StorageDirectory sd, long txid) throws IOException {
+ File newFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
+ File dstFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);
+
FSImageFormat.Saver saver = new FSImageFormat.Saver();
FSImageCompression compression = FSImageCompression.createCompression(conf);
- saver.save(newFile, getFSNamesystem(), compression);
- storage.setImageDigest(saver.getSavedDigest());
+ saver.save(newFile, txid, getFSNamesystem(), compression);
+
+ MD5FileUtils.saveMD5File(dstFile, saver.getSavedDigest());
+ storage.setMostRecentCheckpointTxId(txid);
}
/**
@@ -797,15 +760,17 @@ void saveFSImage(File newFile) throws IOException {
private class FSImageSaver implements Runnable {
private StorageDirectory sd;
private List<StorageDirectory> errorSDs;
+ private final long txid;
- FSImageSaver(StorageDirectory sd, List errorSDs) {
+ FSImageSaver(StorageDirectory sd, List errorSDs, long txid) {
this.sd = sd;
this.errorSDs = errorSDs;
+ this.txid = txid;
}
public void run() {
try {
- saveCurrent(sd);
+ saveFSImage(sd, txid);
} catch (Throwable t) {
LOG.error("Unable to save image for " + sd.getRoot(), t);
errorSDs.add(sd);
@@ -831,295 +796,125 @@ private void waitForThreads(List<Thread> threads) {
}
}
/**
- * Save the contents of the FS image and create empty edits.
- *
- * In order to minimize the recovery effort in case of failure during
- * saveNamespace the algorithm reduces discrepancy between directory states
- * by performing updates in the following order:
- *
- * - rename current to lastcheckpoint.tmp for all of them,
- * - save image and recreate edits for all of them,
- * - rename lastcheckpoint.tmp to previous.checkpoint.
- *
- * On stage (2) we first save all images, then recreate edits.
- * Otherwise the name-node may purge all edits and fail,
- * in which case the journal will be lost.
+ * Save the contents of the FS image to a new image file in each of the
+ * current storage directories.
*/
- void saveNamespace(boolean renewCheckpointTime) throws IOException {
-
- // try to restore all failed edit logs here
+ void saveNamespace() throws IOException {
assert editLog != null : "editLog must be initialized";
storage.attemptRestoreRemovedStorage();
- editLog.close();
- if(renewCheckpointTime)
- storage.setCheckpointTime(now());
- List<StorageDirectory> errorSDs =
- Collections.synchronizedList(new ArrayList<StorageDirectory>());
-
- // mv current -> lastcheckpoint.tmp
- for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- try {
- storage.moveCurrent(sd);
- } catch(IOException ie) {
- LOG.error("Unable to move current for " + sd.getRoot(), ie);
- errorSDs.add(sd);
+ boolean editLogWasOpen = editLog.isOpen();
+
+ if (editLogWasOpen) {
+ editLog.endCurrentLogSegment(true);
+ }
+ long imageTxId = editLog.getLastWrittenTxId();
+ try {
+ saveFSImageInAllDirs(imageTxId);
+ storage.writeAll();
+ } finally {
+ if (editLogWasOpen) {
+ editLog.startLogSegment(imageTxId + 1, true);
+ // Take this opportunity to note the current transaction
+ storage.writeTransactionIdFileToStorage(imageTxId + 1);
}
}
+
+ }
+
+ protected void saveFSImageInAllDirs(long txid) throws IOException {
+ if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
+ throw new IOException("No image directories available!");
+ }
+
+ List<StorageDirectory> errorSDs =
+ Collections.synchronizedList(new ArrayList<StorageDirectory>());
List<Thread> saveThreads = new ArrayList<Thread>();
// save images into current
for (Iterator<StorageDirectory> it
= storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
StorageDirectory sd = it.next();
- if (errorSDs.contains(sd)) {
- continue;
- }
- try {
- FSImageSaver saver = new FSImageSaver(sd, errorSDs);
- Thread saveThread = new Thread(saver, saver.toString());
- saveThreads.add(saveThread);
- saveThread.start();
- } catch (Exception e) {
- LOG.error("Failed save to image directory " + sd.getRoot(), e);
- errorSDs.add(sd);
- continue;
- }
+ FSImageSaver saver = new FSImageSaver(sd, errorSDs, txid);
+ Thread saveThread = new Thread(saver, saver.toString());
+ saveThreads.add(saveThread);
+ saveThread.start();
}
waitForThreads(saveThreads);
saveThreads.clear();
+ storage.reportErrorsOnDirectories(errorSDs);
- // -NOTE-
- // If NN has image-only and edits-only storage directories and fails here
- // the image will have the latest namespace state.
- // During startup the image-only directories will recover by discarding
- // lastcheckpoint.tmp, while
- // the edits-only directories will recover by falling back
- // to the old state contained in their lastcheckpoint.tmp.
- // The edits directories should be discarded during startup because their
- // checkpointTime is older than that of image directories.
- // recreate edits in current
- for (Iterator<StorageDirectory> it
- = storage.dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
- StorageDirectory sd = it.next();
- if (errorSDs.contains(sd)) {
- continue;
- }
-
- // if this directory already stores the image and edits, then it was
- // already processed in the earlier loop.
- if (sd.getStorageDirType() == NameNodeDirType.IMAGE_AND_EDITS) {
- continue;
- }
-
- try {
- FSImageSaver saver = new FSImageSaver(sd, errorSDs);
- Thread saveThread = new Thread(saver, saver.toString());
- saveThreads.add(saveThread);
- saveThread.start();
- } catch (Exception e) {
- LOG.error("Failed save to edits directory " + sd.getRoot(), e);
- errorSDs.add(sd);
- continue;
- }
+ if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
+ throw new IOException(
+ "Failed to save in any storage directories while saving namespace.");
}
- waitForThreads(saveThreads);
- // mv lastcheckpoint.tmp -> previous.checkpoint
- for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- if (errorSDs.contains(sd)) {
- continue;
- }
- try {
- storage.moveLastCheckpoint(sd);
- } catch(IOException ie) {
- LOG.error("Unable to move last checkpoint for " + sd.getRoot(), ie);
- errorSDs.add(sd);
- continue;
- }
- }
+ renameCheckpoint(txid);
+ // Since we now have a new checkpoint, we can clean up some
+ // old edit logs and checkpoints.
+ purgeOldStorage();
+ }
+
+ /**
+ * Purge any files in the storage directories that are no longer
+ * necessary.
+ */
+ public void purgeOldStorage() {
try {
- storage.reportErrorsOnDirectories(errorSDs);
-
- // If there was an error in every storage dir, each one will have been
- // removed from the list of storage directories.
- if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0 ||
- storage.getNumStorageDirs(NameNodeDirType.EDITS) == 0) {
- throw new IOException("Failed to save any storage directories while saving namespace");
- }
-
- if(!editLog.isOpen()) editLog.open();
- } finally {
- ckptState = CheckpointStates.UPLOAD_DONE;
+ archivalManager.purgeOldStorage();
+ } catch (Exception e) {
+ LOG.warn("Unable to purge old storage", e);
}
}
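A plausible retention rule for a manager like NNStorageRetentionManager is to keep the newest N image files plus any edits segments still needed to roll the oldest retained image forward. A self-contained sketch of that selection rule over made-up txid-stamped files; the retention count and the exact boundary condition are assumptions, not the class's confirmed behavior:

    import java.util.*;

    class RetentionSketch {
      public static void main(String[] args) {
        int imagesToRetain = 2;    // hypothetical retention count
        List<Long> imageTxIds = new ArrayList<>(Arrays.asList(100L, 200L, 300L));
        Collections.sort(imageTxIds);

        // Keep the newest N images; everything older is purgeable.
        List<Long> kept = imageTxIds.subList(
            Math.max(0, imageTxIds.size() - imagesToRetain), imageTxIds.size());
        long minImageTxId = kept.get(0);

        // Keep any edits segment that could still be needed to roll
        // forward from the oldest retained image.
        long[][] segments = {{1, 100}, {101, 200}, {201, 300}, {301, 350}};
        for (long[] seg : segments) {
          boolean keep = seg[1] >= minImageTxId;   // boundary condition assumed
          System.out.println("edits_" + seg[0] + "-" + seg[1] +
              (keep ? ": kept" : ": purged"));
        }
        System.out.println("images kept (by txid): " + kept);
      }
    }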
- /**
- * Save current image and empty journal into {@code current} directory.
- */
- protected void saveCurrent(StorageDirectory sd) throws IOException {
- if (storage.getLayoutVersion() != FSConstants.LAYOUT_VERSION) {
- throw new IllegalStateException(
- "NN with storage version " + FSConstants.LAYOUT_VERSION +
- "cannot save an image with version " + storage.getLayoutVersion());
- }
- File curDir = sd.getCurrentDir();
- NameNodeDirType dirType = (NameNodeDirType)sd.getStorageDirType();
- // save new image or new edits
- if (!curDir.exists() && !curDir.mkdir())
- throw new IOException("Cannot create directory " + curDir);
- if (dirType.isOfType(NameNodeDirType.IMAGE))
- saveFSImage(NNStorage.getStorageFile(sd, NameNodeFile.IMAGE));
- if (dirType.isOfType(NameNodeDirType.EDITS))
- editLog.createEditLogFile(NNStorage.getStorageFile(sd,
- NameNodeFile.EDITS));
- // write version and time files
- storage.writeProperties(sd);
- }
-
-
- /**
- * Moves fsimage.ckpt to fsImage and edits.new to edits
- * Reopens the new edits file.
- */
- void rollFSImage(CheckpointSignature sig,
- boolean renewCheckpointTime) throws IOException {
- sig.validateStorageInfo(this);
- rollFSImage(true);
- }
-
- private void rollFSImage(boolean renewCheckpointTime)
- throws IOException {
- if (ckptState != CheckpointStates.UPLOAD_DONE
- && !(ckptState == CheckpointStates.ROLLED_EDITS
- && storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0)) {
- throw new IOException("Cannot roll fsImage before rolling edits log.");
- }
-
- for (Iterator<StorageDirectory> it
- = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
- StorageDirectory sd = it.next();
- File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW);
- if (!ckpt.exists()) {
- throw new IOException("Checkpoint file " + ckpt +
- " does not exist");
- }
- }
- editLog.purgeEditLog(); // renamed edits.new to edits
- if(LOG.isDebugEnabled()) {
- LOG.debug("rollFSImage after purgeEditLog: storageList="
- + storage.listStorageDirectories());
- }
- //
- // Renames new image
- //
- renameCheckpoint();
- resetVersion(renewCheckpointTime, newImageDigest);
- }
-
/**
* Renames new image
*/
- void renameCheckpoint() throws IOException {
+ private void renameCheckpoint(long txid) throws IOException {
ArrayList<StorageDirectory> al = null;
- for (Iterator<StorageDirectory> it
- = storage.dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
- StorageDirectory sd = it.next();
- File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW);
- File curFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE);
- // renameTo fails on Windows if the destination file
- // already exists.
- if(LOG.isDebugEnabled()) {
- LOG.debug("renaming " + ckpt.getAbsolutePath()
- + " to " + curFile.getAbsolutePath());
- }
- if (!ckpt.renameTo(curFile)) {
- if (!curFile.delete() || !ckpt.renameTo(curFile)) {
- LOG.warn("renaming " + ckpt.getAbsolutePath() + " to " +
- curFile.getAbsolutePath() + " FAILED");
- if(al == null) al = new ArrayList<StorageDirectory>(1);
- al.add(sd);
+ for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
+ try {
+ renameCheckpointInDir(sd, txid);
+ } catch (IOException ioe) {
+ LOG.warn("Unable to rename checkpoint in " + sd, ioe);
+ if (al == null) {
+ al = Lists.newArrayList();
}
+ al.add(sd);
}
}
if(al != null) storage.reportErrorsOnDirectories(al);
}
- /**
- * Updates version and fstime files in all directories (fsimage and edits).
- */
- void resetVersion(boolean renewCheckpointTime, MD5Hash newImageDigest)
+ private void renameCheckpointInDir(StorageDirectory sd, long txid)
throws IOException {
- storage.layoutVersion = FSConstants.LAYOUT_VERSION;
- if(renewCheckpointTime)
- storage.setCheckpointTime(now());
- storage.setImageDigest(newImageDigest);
-
- ArrayList<StorageDirectory> al = null;
- for (Iterator<StorageDirectory> it = storage.dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
- // delete old edits if sd is an image-only directory
- if (!sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
- File editsFile = NNStorage.getStorageFile(sd, NameNodeFile.EDITS);
- if(editsFile.exists() && !editsFile.delete())
- throw new IOException("Cannot delete edits file "
- + editsFile.getCanonicalPath());
- }
- // delete old fsimage if sd is an edits-only directory
- if (!sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) {
- File imageFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE);
- if(imageFile.exists() && !imageFile.delete())
- throw new IOException("Cannot delete image file "
- + imageFile.getCanonicalPath());
- }
- try {
- storage.writeProperties(sd);
- } catch (IOException e) {
- LOG.error("Cannot write file " + sd.getRoot(), e);
-
- if(al == null) al = new ArrayList<StorageDirectory>(1);
- al.add(sd);
- }
+ File ckpt = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE_NEW, txid);
+ File curFile = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE, txid);
+ // renameTo fails on Windows if the destination file
+ // already exists.
+ if(LOG.isDebugEnabled()) {
+ LOG.debug("renaming " + ckpt.getAbsolutePath()
+ + " to " + curFile.getAbsolutePath());
}
- if(al != null) storage.reportErrorsOnDirectories(al);
- ckptState = FSImage.CheckpointStates.START;
+ if (!ckpt.renameTo(curFile)) {
+ if (!curFile.delete() || !ckpt.renameTo(curFile)) {
+ throw new IOException("renaming " + ckpt.getAbsolutePath() + " to " +
+ curFile.getAbsolutePath() + " FAILED");
+ }
+ }
}
CheckpointSignature rollEditLog() throws IOException {
getEditLog().rollEditLog();
- ckptState = CheckpointStates.ROLLED_EDITS;
- // If checkpoint fails this should be the most recent image, therefore
- storage.incrementCheckpointTime();
+ // Record this log segment ID in all of the storage directories, so
+ // we won't miss this log segment on a restart if the edits directories
+ // go missing.
+ storage.writeTransactionIdFileToStorage(getEditLog().getCurSegmentTxId());
return new CheckpointSignature(this);
}
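
writeTransactionIdFileToStorage persists the new segment's start txid so that a
restart can detect missing edits. A sketch of such a marker file, assuming a plain
long-as-text layout; the real encoding is whatever NNStorage uses:

    import java.io.*;

    class SeenTxIdFile {
      // Write the transaction ID to a small marker file. At startup, an
      // image/edits combination that cannot reach this txid indicates
      // missing log segments.
      static void write(File f, long txid) throws IOException {
        Writer w = new OutputStreamWriter(new FileOutputStream(f), "UTF-8");
        try {
          w.write(Long.toString(txid));
        } finally {
          w.close();
        }
      }

      static long read(File f) throws IOException {
        BufferedReader r = new BufferedReader(
            new InputStreamReader(new FileInputStream(f), "UTF-8"));
        try {
          return Long.parseLong(r.readLine().trim());
        } finally {
          r.close();
        }
      }
    }
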
- /**
- * This is called just before a new checkpoint is uploaded to the
- * namenode.
- */
- void validateCheckpointUpload(CheckpointSignature sig) throws IOException {
- if (ckptState != CheckpointStates.ROLLED_EDITS) {
- throw new IOException("Namenode is not expecting an new image " +
- ckptState);
- }
- // verify token
- long modtime = getEditLog().getFsEditTime();
- if (sig.editsTime != modtime) {
- throw new IOException("Namenode has an edit log with timestamp of " +
- DATE_FORM.format(new Date(modtime)) +
- " but new checkpoint was created using editlog " +
- " with timestamp " +
- DATE_FORM.format(new Date(sig.editsTime)) +
- ". Checkpoint Aborted.");
- }
- sig.validateStorageInfo(this);
- ckptState = FSImage.CheckpointStates.UPLOAD_START;
- }
-
/**
* Start checkpoint.
*
@@ -1150,34 +945,23 @@ else if(bnReg.isRole(NamenodeRole.NAMENODE))
+ " role " + bnReg.getRole() + ": checkpoint is not allowed.";
else if(bnReg.getLayoutVersion() < storage.getLayoutVersion()
|| (bnReg.getLayoutVersion() == storage.getLayoutVersion()
- && bnReg.getCTime() > storage.getCTime())
- || (bnReg.getLayoutVersion() == storage.getLayoutVersion()
- && bnReg.getCTime() == storage.getCTime()
- && bnReg.getCheckpointTime() > storage.getCheckpointTime()))
+ && bnReg.getCTime() > storage.getCTime()))
// remote node has newer image age
msg = "Name node " + bnReg.getAddress()
+ " has newer image layout version: LV = " +bnReg.getLayoutVersion()
+ " cTime = " + bnReg.getCTime()
- + " checkpointTime = " + bnReg.getCheckpointTime()
+ ". Current version: LV = " + storage.getLayoutVersion()
- + " cTime = " + storage.getCTime()
- + " checkpointTime = " + storage.getCheckpointTime();
+ + " cTime = " + storage.getCTime();
if(msg != null) {
LOG.error(msg);
return new NamenodeCommand(NamenodeProtocol.ACT_SHUTDOWN);
}
- boolean isImgObsolete = true;
- if(bnReg.getLayoutVersion() == storage.getLayoutVersion()
- && bnReg.getCTime() == storage.getCTime()
- && bnReg.getCheckpointTime() == storage.getCheckpointTime())
- isImgObsolete = false;
boolean needToReturnImg = true;
if(storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0)
// do not return image if there are no image directories
needToReturnImg = false;
CheckpointSignature sig = rollEditLog();
- getEditLog().logJSpoolStart(bnReg, nnReg);
- return new CheckpointCommand(sig, isImgObsolete, needToReturnImg);
+ return new CheckpointCommand(sig, needToReturnImg);
}
/**
@@ -1196,32 +980,41 @@ else if(bnReg.getLayoutVersion() < storage.getLayoutVersion()
void endCheckpoint(CheckpointSignature sig,
NamenodeRole remoteNNRole) throws IOException {
sig.validateStorageInfo(this);
- // Renew checkpoint time for the active if the other is a checkpoint-node.
- // The checkpoint-node should have older image for the next checkpoint
- // to take effect.
- // The backup-node always has up-to-date image and will have the same
- // checkpoint time as the active node.
- boolean renewCheckpointTime = remoteNNRole.equals(NamenodeRole.CHECKPOINT);
- rollFSImage(sig, renewCheckpointTime);
- }
-
- CheckpointStates getCheckpointState() {
- return ckptState;
- }
-
- void setCheckpointState(CheckpointStates cs) {
- ckptState = cs;
}
/**
- * This is called when a checkpoint upload finishes successfully.
+ * This is called by the 2NN after having downloaded an image, and by
+ * the NN after having received a new image from the 2NN. It
+ * renames the image from fsimage_N.ckpt to fsimage_N and also
+ * saves the related .md5 file into place.
*/
- synchronized void checkpointUploadDone() {
- ckptState = CheckpointStates.UPLOAD_DONE;
+ synchronized void saveDigestAndRenameCheckpointImage(
+ long txid, MD5Hash digest) throws IOException {
+ renameCheckpoint(txid);
+ List<StorageDirectory> badSds = Lists.newArrayList();
+
+ for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
+ File imageFile = NNStorage.getImageFile(sd, txid);
+ try {
+ MD5FileUtils.saveMD5File(imageFile, digest);
+ } catch (IOException ioe) {
+ badSds.add(sd);
+ }
+ }
+ storage.reportErrorsOnDirectories(badSds);
+
+ // So long as this is the newest image available,
+ // advertise it as such to other checkpointers
+ // from now on
+ if (txid > storage.getMostRecentCheckpointTxId()) {
+ storage.setMostRecentCheckpointTxId(txid);
+ }
}
synchronized public void close() throws IOException {
- getEditLog().close();
+ if (editLog != null) { // 2NN doesn't have any edit log
+ getEditLog().close();
+ }
storage.close();
}
@@ -1257,30 +1050,6 @@ public NNStorage getStorage() {
return storage;
}
- @Override // NNStorageListener
- public void errorOccurred(StorageDirectory sd) throws IOException {
- // do nothing,
- }
-
- @Override // NNStorageListener
- public void formatOccurred(StorageDirectory sd) throws IOException {
- if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) {
- sd.lock();
- try {
- saveCurrent(sd);
- } finally {
- sd.unlock();
- }
- LOG.info("Storage directory " + sd.getRoot()
- + " has been successfully formatted.");
- }
- };
-
- @Override // NNStorageListener
- public void directoryAvailable(StorageDirectory sd) throws IOException {
- // do nothing
- }
-
public int getLayoutVersion() {
return storage.getLayoutVersion();
}
@@ -1296,4 +1065,8 @@ public String getClusterID() {
public String getBlockPoolID() {
return storage.getBlockPoolID();
}
+
+ public synchronized long getLastAppliedTxId() {
+ return lastAppliedTxId;
+ }
}
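
The patch replaces checkpoint-time stamps with txid-stamped file names. A sketch of
the naming helpers implied by the inspector regexes below; the zero-padding width is
an assumption, and the real helpers live on NNStorage:

    class NameScheme {
      // fsimage_0000000000000040000       image as of txid 40000
      // fsimage_0000000000000040000.ckpt  image still being written/uploaded
      // edits_..40001-..42000             finalized log segment
      // edits_inprogress_..42001          currently open log segment
      static String imageName(long txid) {
        return String.format("fsimage_%019d", txid);
      }
      static String checkpointImageName(long txid) {
        return imageName(txid) + ".ckpt";
      }
      static String finalizedEditsName(long start, long end) {
        return String.format("edits_%019d-%019d", start, end);
      }
      static String inProgressEditsName(long start) {
        return String.format("edits_inprogress_%019d", start);
      }
    }
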
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java
index 46d08970b32..5bad6be91dd 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java
@@ -44,6 +44,7 @@
import org.apache.hadoop.hdfs.protocol.LayoutVersion.Feature;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
+import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
@@ -71,10 +72,8 @@ static class Loader {
/** Set to true once a file has been loaded using this loader. */
private boolean loaded = false;
- /** The image version of the loaded file */
- private int imgVersion;
- /** The namespace ID of the loaded file */
- private int imgNamespaceID;
+ /** The transaction ID of the last edit represented by the loaded file */
+ private long imgTxId;
/** The MD5 sum of the loaded file */
private MD5Hash imgDigest;
@@ -83,15 +82,6 @@ static class Loader {
this.namesystem = namesystem;
}
- /**
- * Return the version number of the image that has been loaded.
- * @throws IllegalStateException if load() has not yet been called.
- */
- int getLoadedImageVersion() {
- checkLoaded();
- return imgVersion;
- }
-
/**
* Return the MD5 checksum of the image that has been loaded.
* @throws IllegalStateException if load() has not yet been called.
@@ -101,13 +91,9 @@ MD5Hash getLoadedImageMd5() {
return imgDigest;
}
- /**
- * Return the namespace ID of the image that has been loaded.
- * @throws IllegalStateException if load() has not yet been called.
- */
- int getLoadedNamespaceID() {
+ long getLoadedImageTxId() {
checkLoaded();
- return imgNamespaceID;
+ return imgTxId;
}
/**
@@ -156,10 +142,14 @@ void load(File curFile)
* it should not contain version and namespace fields
*/
// read image version: first appeared in version -1
- imgVersion = in.readInt();
+ int imgVersion = in.readInt();
+ if(getLayoutVersion() != imgVersion)
+ throw new InconsistentFSStateException(curFile,
+ "imgVersion " + imgVersion +
+ " expected to be " + getLayoutVersion());
// read namespaceID: first appeared in version -2
- imgNamespaceID = in.readInt();
+ in.readInt();
// read number of files
long numFiles = readNumFiles(in);
@@ -169,6 +159,15 @@ void load(File curFile)
long genstamp = in.readLong();
namesystem.setGenerationStamp(genstamp);
}
+
+ // read the transaction ID of the last edit represented by
+ // this image
+ if (LayoutVersion.supports(Feature.STORED_TXIDS, imgVersion)) {
+ imgTxId = in.readLong();
+ } else {
+ imgTxId = 0;
+ }
+
// read compression related info
FSImageCompression compression;
@@ -234,7 +233,7 @@ private void updateRootAttr(INode root) {
private void loadLocalNameINodes(long numFiles, DataInputStream in)
throws IOException {
assert LayoutVersion.supports(Feature.FSIMAGE_NAME_OPTIMIZATION,
- imgVersion);
+ getLayoutVersion());
assert numFiles > 0;
// load root
@@ -324,11 +323,12 @@ private void loadFullNameINodes(long numFiles,
* @return an inode
*/
private INode loadINode(DataInputStream in)
- throws IOException {
+ throws IOException {
long modificationTime = 0;
long atime = 0;
long blockSize = 0;
+ int imgVersion = getLayoutVersion();
short replication = in.readShort();
replication = namesystem.adjustReplication(replication);
modificationTime = in.readLong();
@@ -396,7 +396,10 @@ private INode loadINode(DataInputStream in)
modificationTime, atime, nsQuota, dsQuota, blockSize);
}
- private void loadDatanodes(DataInputStream in) throws IOException {
+ private void loadDatanodes(DataInputStream in)
+ throws IOException {
+ int imgVersion = getLayoutVersion();
+
if (imgVersion > -3) // pre datanode image version
return;
if (imgVersion <= -12) {
@@ -412,6 +415,7 @@ private void loadDatanodes(DataInputStream in) throws IOException {
private void loadFilesUnderConstruction(DataInputStream in)
throws IOException {
FSDirectory fsDir = namesystem.dir;
+ int imgVersion = getLayoutVersion();
if (imgVersion > -13) // pre lease image version
return;
int size = in.readInt();
@@ -437,7 +441,10 @@ private void loadFilesUnderConstruction(DataInputStream in)
}
}
- private void loadSecretManagerState(DataInputStream in) throws IOException {
+ private void loadSecretManagerState(DataInputStream in)
+ throws IOException {
+ int imgVersion = getLayoutVersion();
+
if (!LayoutVersion.supports(Feature.DELEGATION_TOKEN, imgVersion)) {
//SecretManagerState is not available.
//This must not happen if security is turned on.
@@ -446,8 +453,14 @@ private void loadSecretManagerState(DataInputStream in) throws IOException {
namesystem.loadSecretManagerState(in);
}
+ private int getLayoutVersion() {
+ return namesystem.getFSImage().getStorage().getLayoutVersion();
+ }
+
- private long readNumFiles(DataInputStream in) throws IOException {
+ private long readNumFiles(DataInputStream in)
+ throws IOException {
+ int imgVersion = getLayoutVersion();
if (LayoutVersion.supports(Feature.NAMESPACE_QUOTA, imgVersion)) {
return in.readLong();
} else {
@@ -526,6 +539,7 @@ MD5Hash getSavedDigest() {
}
void save(File newFile,
+ long txid,
FSNamesystem sourceNamesystem,
FSImageCompression compression)
throws IOException {
@@ -542,9 +556,11 @@ void save(File newFile,
DataOutputStream out = new DataOutputStream(fos);
try {
out.writeInt(FSConstants.LAYOUT_VERSION);
- out.writeInt(sourceNamesystem.getFSImage().getStorage().getNamespaceID()); // TODO bad dependency
+ out.writeInt(sourceNamesystem.getFSImage()
+ .getStorage().getNamespaceID()); // TODO bad dependency
out.writeLong(fsDir.rootDir.numItemsInTree());
out.writeLong(sourceNamesystem.getGenerationStamp());
+ out.writeLong(txid);
// write compression info and set up compressed stream
out = compression.writeHeaderAndWrapStream(fos);
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java
new file mode 100644
index 00000000000..cec2eeff2d8
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImagePreTransactionalStorageInspector.java
@@ -0,0 +1,272 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
+
+/**
+ * Inspects a FSImage storage directory in the "old" (pre-HDFS-1073) format.
+ * This format has the following data files:
+ * - fsimage
+ * - fsimage.ckpt (when checkpoint is being uploaded)
+ * - edits
+ * - edits.new (when logs are "rolled")
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+class FSImagePreTransactionalStorageInspector extends FSImageStorageInspector {
+ private static final Log LOG =
+ LogFactory.getLog(FSImagePreTransactionalStorageInspector.class);
+
+ /* Flag if there is at least one storage dir that doesn't contain the newest
+ * fstime */
+ private boolean hasOutOfDateStorageDirs = false;
+ /* Flag set false if there are any "previous" directories found */
+ private boolean isUpgradeFinalized = true;
+
+ // Track the name and edits dir with the latest times
+ private long latestNameCheckpointTime = Long.MIN_VALUE;
+ private long latestEditsCheckpointTime = Long.MIN_VALUE;
+ private StorageDirectory latestNameSD = null;
+ private StorageDirectory latestEditsSD = null;
+
+ /** Set to determine if all storage directories share the same checkpoint */
+ Set<Long> checkpointTimes = new HashSet<Long>();
+
+ private List<String> imageDirs = new ArrayList<String>();
+ private List<String> editsDirs = new ArrayList<String>();
+
+ @Override
+ void inspectDirectory(StorageDirectory sd) throws IOException {
+ // Was the file just formatted?
+ if (!sd.getVersionFile().exists()) {
+ hasOutOfDateStorageDirs = true;
+ return;
+ }
+
+ boolean imageExists = false;
+ boolean editsExists = false;
+
+ // Determine if sd is image, edits or both
+ if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) {
+ imageExists = NNStorage.getStorageFile(sd, NameNodeFile.IMAGE).exists();
+ imageDirs.add(sd.getRoot().getCanonicalPath());
+ }
+
+ if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
+ editsExists = NNStorage.getStorageFile(sd, NameNodeFile.EDITS).exists();
+ editsDirs.add(sd.getRoot().getCanonicalPath());
+ }
+
+ long checkpointTime = readCheckpointTime(sd);
+
+ checkpointTimes.add(checkpointTime);
+
+ if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE) &&
+ (latestNameCheckpointTime < checkpointTime) && imageExists) {
+ latestNameCheckpointTime = checkpointTime;
+ latestNameSD = sd;
+ }
+
+ if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS) &&
+ (latestEditsCheckpointTime < checkpointTime) && editsExists) {
+ latestEditsCheckpointTime = checkpointTime;
+ latestEditsSD = sd;
+ }
+
+ // check that we have a valid, non-default checkpointTime
+ if (checkpointTime <= 0L)
+ hasOutOfDateStorageDirs = true;
+
+ // set finalized flag
+ isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists();
+ }
+
+ /**
+ * Determine the checkpoint time of the specified StorageDirectory
+ *
+ * @param sd StorageDirectory to check
+ * @return If file exists and can be read, last checkpoint time. If not, 0L.
+ * @throws IOException On errors processing file pointed to by sd
+ */
+ static long readCheckpointTime(StorageDirectory sd) throws IOException {
+ File timeFile = NNStorage.getStorageFile(sd, NameNodeFile.TIME);
+ long timeStamp = 0L;
+ if (timeFile.exists() && timeFile.canRead()) {
+ DataInputStream in = new DataInputStream(new FileInputStream(timeFile));
+ try {
+ timeStamp = in.readLong();
+ } finally {
+ in.close();
+ }
+ }
+ return timeStamp;
+ }
+
+ @Override
+ boolean isUpgradeFinalized() {
+ return isUpgradeFinalized;
+ }
+
+ @Override
+ LoadPlan createLoadPlan() throws IOException {
+ // We should have at least one image and one edits dirs
+ if (latestNameSD == null)
+ throw new IOException("Image file is not found in " + imageDirs);
+ if (latestEditsSD == null)
+ throw new IOException("Edits file is not found in " + editsDirs);
+
+ // Make sure we are loading image and edits from same checkpoint
+ if (latestNameCheckpointTime > latestEditsCheckpointTime
+ && latestNameSD != latestEditsSD
+ && latestNameSD.getStorageDirType() == NameNodeDirType.IMAGE
+ && latestEditsSD.getStorageDirType() == NameNodeDirType.EDITS) {
+ // This is a rare failure when NN has image-only and edits-only
+ // storage directories, and fails right after saving the image in
+ // some of the storage directories, but before purging edits.
+ // See -NOTE- in saveNamespace().
+ LOG.error("This is a rare failure scenario!!!");
+ LOG.error("Image checkpoint time " + latestNameCheckpointTime +
+ " > edits checkpoint time " + latestEditsCheckpointTime);
+ LOG.error("Name-node will treat the image as the latest state of " +
+ "the namespace. Old edits will be discarded.");
+ } else if (latestNameCheckpointTime != latestEditsCheckpointTime) {
+ throw new IOException("Inconsistent storage detected, " +
+ "image and edits checkpoint times do not match. " +
+ "image checkpoint time = " + latestNameCheckpointTime +
+ "edits checkpoint time = " + latestEditsCheckpointTime);
+ }
+
+ return new PreTransactionalLoadPlan();
+ }
+
+ @Override
+ boolean needToSave() {
+ return hasOutOfDateStorageDirs ||
+ checkpointTimes.size() != 1 ||
+ latestNameCheckpointTime > latestEditsCheckpointTime;
+
+ }
+
+ private class PreTransactionalLoadPlan extends LoadPlan {
+
+ @Override
+ boolean doRecovery() throws IOException {
+ LOG.debug(
+ "Performing recovery in "+ latestNameSD + " and " + latestEditsSD);
+
+ boolean needToSave = false;
+ File curFile =
+ NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE);
+ File ckptFile =
+ NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE_NEW);
+
+ //
+ // If we were in the midst of a checkpoint
+ //
+ if (ckptFile.exists()) {
+ needToSave = true;
+ if (NNStorage.getStorageFile(latestEditsSD, NameNodeFile.EDITS_NEW)
+ .exists()) {
+ //
+ // checkpointing might have uploaded a new
+ // merged image, but we discard it here because we are
+ // not sure whether the entire merged image was uploaded
+ // before the namenode crashed.
+ //
+ if (!ckptFile.delete()) {
+ throw new IOException("Unable to delete " + ckptFile);
+ }
+ } else {
+ //
+ // checkpointing was in progress when the namenode
+ // shutdown. The fsimage.ckpt was created and the edits.new
+ // file was moved to edits. We complete that checkpoint by
+ // moving fsimage.new to fsimage. There is no need to
+ // update the fstime file here. renameTo fails on Windows
+ // if the destination file already exists.
+ //
+ if (!ckptFile.renameTo(curFile)) {
+ if (!curFile.delete())
+ LOG.warn("Unable to delete dir " + curFile + " before rename");
+ if (!ckptFile.renameTo(curFile)) {
+ throw new IOException("Unable to rename " + ckptFile +
+ " to " + curFile);
+ }
+ }
+ }
+ }
+ return needToSave;
+ }
+
+ @Override
+ File getImageFile() {
+ return NNStorage.getStorageFile(latestNameSD, NameNodeFile.IMAGE);
+ }
+
+ @Override
+ List<File> getEditsFiles() {
+ if (latestNameCheckpointTime > latestEditsCheckpointTime) {
+ // the image is already current, discard edits
+ LOG.debug(
+ "Name checkpoint time is newer than edits, not loading edits.");
+ return Collections.emptyList();
+ }
+
+ return getEditsInStorageDir(latestEditsSD);
+ }
+
+ @Override
+ StorageDirectory getStorageDirectoryForProperties() {
+ return latestNameSD;
+ }
+ }
+
+ /**
+ * @return a list with the paths to EDITS and EDITS_NEW (if it exists)
+ * in a given storage directory.
+ */
+ static List<File> getEditsInStorageDir(StorageDirectory sd) {
+ ArrayList<File> files = new ArrayList<File>();
+ File edits = NNStorage.getStorageFile(sd, NameNodeFile.EDITS);
+ assert edits.exists() : "Expected edits file at " + edits;
+ files.add(edits);
+ File editsNew = NNStorage.getStorageFile(sd, NameNodeFile.EDITS_NEW);
+ if (editsNew.exists()) {
+ files.add(editsNew);
+ }
+ return files;
+ }
+}
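
To make the pre-transactional needToSave() rule concrete, a toy run with
hypothetical fstime values rather than real StorageDirectory objects:

    import java.util.HashSet;
    import java.util.Set;

    class PreTxnNeedToSaveDemo {
      public static void main(String[] args) {
        // Suppose an image dir recorded fstime 1000 and an edits dir 900:
        long latestNameCheckpointTime = 1000L;
        long latestEditsCheckpointTime = 900L;
        Set<Long> checkpointTimes = new HashSet<Long>();
        checkpointTimes.add(1000L);
        checkpointTimes.add(900L);
        boolean hasOutOfDateStorageDirs = false;

        // Any disagreement between directories forces a re-save on load.
        boolean needToSave = hasOutOfDateStorageDirs
            || checkpointTimes.size() != 1
            || latestNameCheckpointTime > latestEditsCheckpointTime;
        System.out.println("needToSave = " + needToSave); // prints true
      }
    }
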
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java
new file mode 100644
index 00000000000..6249f2f5d5d
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageStorageInspector.java
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+
+/**
+ * Abstract class responsible for inspecting a set of storage directories and devising
+ * a plan to load the namespace from them.
+ */
+@InterfaceAudience.Private
+@InterfaceStability.Unstable
+abstract class FSImageStorageInspector {
+ /**
+ * Inspect the contents of the given storage directory.
+ */
+ abstract void inspectDirectory(StorageDirectory sd) throws IOException;
+
+ /**
+ * @return false if any of the storage directories have an unfinalized upgrade
+ */
+ abstract boolean isUpgradeFinalized();
+
+ /**
+ * Create a plan to load the image from the set of inspected storage directories.
+ * @throws IOException if not enough files are available (e.g. no image found in any directory)
+ */
+ abstract LoadPlan createLoadPlan() throws IOException;
+
+ /**
+ * @return true if the directories are in such a state that the image should be re-saved
+ * following the load
+ */
+ abstract boolean needToSave();
+
+ /**
+ * A plan to load the namespace from disk, providing the locations from which to load
+ * the image and a set of edits files.
+ */
+ abstract static class LoadPlan {
+ /**
+ * Execute atomic move sequence in the chosen storage directories,
+ * in order to recover from an interrupted checkpoint.
+ * @return true if some recovery action was taken
+ */
+ abstract boolean doRecovery() throws IOException;
+
+ /**
+ * @return the file from which to load the image data
+ */
+ abstract File getImageFile();
+
+ /**
+ * @return a list of files containing edits to replay
+ */
+ abstract List<File> getEditsFiles();
+
+ /**
+ * @return the storage directory containing the VERSION file that should be
+ * loaded.
+ */
+ abstract StorageDirectory getStorageDirectoryForProperties();
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("Will load image file: ").append(getImageFile()).append("\n");
+ sb.append("Will load edits files:").append("\n");
+ for (File f : getEditsFiles()) {
+ sb.append(" ").append(f).append("\n");
+ }
+ sb.append("Will load metadata from: ")
+ .append(getStorageDirectoryForProperties())
+ .append("\n");
+ return sb.toString();
+ }
+ }
+}
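
A caller is expected to drive an inspector roughly as follows. A hedged sketch of
the flow, since the FSImage-side call sites are not part of this hunk:

    import java.io.File;
    import java.io.IOException;
    import java.util.List;

    class LoadPlanDriver {
      // Assumes the caller has already constructed an inspector and fed it
      // every storage directory via inspectDirectory(sd).
      static void load(FSImageStorageInspector inspector) throws IOException {
        FSImageStorageInspector.LoadPlan plan = inspector.createLoadPlan();
        plan.doRecovery();                  // finalize or move aside logs
        File image = plan.getImageFile();   // image to load first
        List<File> edits = plan.getEditsFiles();
        // ...load the image, then replay each edits file in order...
      }
    }
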
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
new file mode 100644
index 00000000000..8a2c2aa4db9
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSImageTransactionalStorageInspector.java
@@ -0,0 +1,688 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdfs.protocol.FSConstants;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FSEditLogLoader.EditLogValidation;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
+import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
+
+import com.google.common.base.Joiner;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+class FSImageTransactionalStorageInspector extends FSImageStorageInspector {
+ public static final Log LOG = LogFactory.getLog(
+ FSImageTransactionalStorageInspector.class);
+
+ private boolean needToSave = false;
+ private boolean isUpgradeFinalized = true;
+
+ List<FoundFSImage> foundImages = new ArrayList<FoundFSImage>();
+ List<FoundEditLog> foundEditLogs = new ArrayList<FoundEditLog>();
+ SortedMap<Long, LogGroup> logGroups = new TreeMap<Long, LogGroup>();
+ long maxSeenTxId = 0;
+
+ private static final Pattern IMAGE_REGEX = Pattern.compile(
+ NameNodeFile.IMAGE.getName() + "_(\\d+)");
+ private static final Pattern EDITS_REGEX = Pattern.compile(
+ NameNodeFile.EDITS.getName() + "_(\\d+)-(\\d+)");
+ private static final Pattern EDITS_INPROGRESS_REGEX = Pattern.compile(
+ NameNodeFile.EDITS_INPROGRESS.getName() + "_(\\d+)");
+
+ @Override
+ public void inspectDirectory(StorageDirectory sd) throws IOException {
+ // Was the directory just formatted?
+ if (!sd.getVersionFile().exists()) {
+ LOG.info("No version file in " + sd.getRoot());
+ needToSave |= true;
+ return;
+ }
+
+ File currentDir = sd.getCurrentDir();
+ File[] filesInStorage;
+ try {
+ filesInStorage = FileUtil.listFiles(currentDir);
+ } catch (IOException ioe) {
+ LOG.warn("Unable to inspect storage directory " + currentDir,
+ ioe);
+ return;
+ }
+
+ for (File f : filesInStorage) {
+ LOG.debug("Checking file " + f);
+ String name = f.getName();
+
+ // Check for fsimage_*
+ Matcher imageMatch = IMAGE_REGEX.matcher(name);
+ if (imageMatch.matches()) {
+ if (sd.getStorageDirType().isOfType(NameNodeDirType.IMAGE)) {
+ try {
+ long txid = Long.valueOf(imageMatch.group(1));
+ foundImages.add(new FoundFSImage(sd, f, txid));
+ } catch (NumberFormatException nfe) {
+ LOG.error("Image file " + f + " has improperly formatted " +
+ "transaction ID");
+ // skip
+ }
+ } else {
+ LOG.warn("Found image file at " + f + " but storage directory is " +
+ "not configured to contain images.");
+ }
+ }
+ }
+
+
+ // Check for a seen_txid file, which marks a minimum transaction ID that
+ // must be included in our load plan.
+ try {
+ maxSeenTxId = Math.max(maxSeenTxId, NNStorage.readTransactionIdFile(sd));
+ } catch (IOException ioe) {
+ LOG.warn("Unable to determine the max transaction ID seen by " + sd, ioe);
+ }
+
+ List<FoundEditLog> editLogs = matchEditLogs(filesInStorage);
+ if (sd.getStorageDirType().isOfType(NameNodeDirType.EDITS)) {
+ for (FoundEditLog log : editLogs) {
+ addEditLog(log);
+ }
+ } else if (!editLogs.isEmpty()){
+ LOG.warn("Found the following edit log file(s) in " + sd +
+ " even though it was not configured to store edits:\n" +
+ " " + Joiner.on("\n ").join(editLogs));
+
+ }
+
+ // set finalized flag
+ isUpgradeFinalized = isUpgradeFinalized && !sd.getPreviousDir().exists();
+ }
+
+ static List<FoundEditLog> matchEditLogs(File[] filesInStorage) {
+ List<FoundEditLog> ret = Lists.newArrayList();
+ for (File f : filesInStorage) {
+ String name = f.getName();
+ // Check for edits
+ Matcher editsMatch = EDITS_REGEX.matcher(name);
+ if (editsMatch.matches()) {
+ try {
+ long startTxId = Long.valueOf(editsMatch.group(1));
+ long endTxId = Long.valueOf(editsMatch.group(2));
+ ret.add(new FoundEditLog(f, startTxId, endTxId));
+ } catch (NumberFormatException nfe) {
+ LOG.error("Edits file " + f + " has improperly formatted " +
+ "transaction ID");
+ // skip
+ }
+ }
+
+ // Check for in-progress edits
+ Matcher inProgressEditsMatch = EDITS_INPROGRESS_REGEX.matcher(name);
+ if (inProgressEditsMatch.matches()) {
+ try {
+ long startTxId = Long.valueOf(inProgressEditsMatch.group(1));
+ ret.add(
+ new FoundEditLog(f, startTxId, FoundEditLog.UNKNOWN_END));
+ } catch (NumberFormatException nfe) {
+ LOG.error("In-progress edits file " + f + " has improperly " +
+ "formatted transaction ID");
+ // skip
+ }
+ }
+ }
+ return ret;
+ }
+
+ private void addEditLog(FoundEditLog foundEditLog) {
+ foundEditLogs.add(foundEditLog);
+ LogGroup group = logGroups.get(foundEditLog.startTxId);
+ if (group == null) {
+ group = new LogGroup(foundEditLog.startTxId);
+ logGroups.put(foundEditLog.startTxId, group);
+ }
+ group.add(foundEditLog);
+ }
+
+
+ @Override
+ public boolean isUpgradeFinalized() {
+ return isUpgradeFinalized;
+ }
+
+ /**
+ * @return the image that has the most recent associated transaction ID.
+ * If there are multiple storage directories which contain equal images
+ * the storage directory that was inspected first will be preferred.
+ *
+ * Returns null if no images were found.
+ */
+ FoundFSImage getLatestImage() {
+ FoundFSImage ret = null;
+ for (FoundFSImage img : foundImages) {
+ if (ret == null || img.txId > ret.txId) {
+ ret = img;
+ }
+ }
+ return ret;
+ }
+
+ public List<FoundFSImage> getFoundImages() {
+ return ImmutableList.copyOf(foundImages);
+ }
+
+ public List<FoundEditLog> getFoundEditLogs() {
+ return ImmutableList.copyOf(foundEditLogs);
+ }
+
+ @Override
+ public LoadPlan createLoadPlan() throws IOException {
+ if (foundImages.isEmpty()) {
+ throw new FileNotFoundException("No valid image files found");
+ }
+
+ FoundFSImage recoveryImage = getLatestImage();
+ LogLoadPlan logPlan = createLogLoadPlan(recoveryImage.txId, Long.MAX_VALUE);
+
+ return new TransactionalLoadPlan(recoveryImage,
+ logPlan);
+ }
+
+ /**
+ * Plan which logs to load in order to bring the namespace up-to-date.
+ * Transactions will be considered in the range (sinceTxId, maxTxId]
+ *
+ * @param sinceTxId the highest txid that is already loaded
+ * (eg from the image checkpoint)
+ * @param maxStartTxId ignore any log files that start after this txid
+ */
+ LogLoadPlan createLogLoadPlan(long sinceTxId, long maxStartTxId) throws IOException {
+ long expectedTxId = sinceTxId + 1;
+
+ List<FoundEditLog> recoveryLogs = new ArrayList<FoundEditLog>();
+
+ SortedMap<Long, LogGroup> tailGroups = logGroups.tailMap(expectedTxId);
+ if (logGroups.size() > tailGroups.size()) {
+ LOG.debug("Excluded " + (logGroups.size() - tailGroups.size()) +
+ " groups of logs because they start with a txid less than image " +
+ "txid " + sinceTxId);
+ }
+
+ SortedMap<Long, LogGroup> usefulGroups;
+ if (maxStartTxId > sinceTxId) {
+ usefulGroups = tailGroups.headMap(maxStartTxId);
+ } else {
+ usefulGroups = new TreeMap<Long, LogGroup>();
+ }
+
+ if (tailGroups.size() > usefulGroups.size()) {
+ LOG.debug("Excluded " + (tailGroups.size() - usefulGroups.size()) +
+ " groups of logs because they start with a txid higher than max " +
+ "start txid " + maxStartTxId);
+ }
+
+
+ for (Map.Entry<Long, LogGroup> entry : usefulGroups.entrySet()) {
+ long logStartTxId = entry.getKey();
+ LogGroup logGroup = entry.getValue();
+
+ logGroup.planRecovery();
+
+ if (expectedTxId != FSConstants.INVALID_TXID && logStartTxId != expectedTxId) {
+ throw new IOException("Expected next log group would start at txid " +
+ expectedTxId + " but starts at txid " + logStartTxId);
+ }
+
+ // We can pick any of the non-corrupt logs here
+ recoveryLogs.add(logGroup.getBestNonCorruptLog());
+
+ // If this log group was finalized, we know to expect the next
+ // log group to start at the following txid (ie no gaps)
+ if (logGroup.hasKnownLastTxId()) {
+ expectedTxId = logGroup.getLastTxId() + 1;
+ } else {
+ // the log group was in-progress so we don't know what ID
+ // the next group should start from.
+ expectedTxId = FSConstants.INVALID_TXID;
+ }
+ }
+
+ long lastLogGroupStartTxId = usefulGroups.isEmpty() ?
+ 0 : usefulGroups.lastKey();
+ if (maxSeenTxId > sinceTxId &&
+ maxSeenTxId > lastLogGroupStartTxId) {
+ String msg = "At least one storage directory indicated it has seen a " +
+ "log segment starting at txid " + maxSeenTxId;
+ if (usefulGroups.isEmpty()) {
+ msg += " but there are no logs to load.";
+ } else {
+ msg += " but the most recent log file found starts with txid " +
+ lastLogGroupStartTxId;
+ }
+ throw new IOException(msg);
+ }
+
+ return new LogLoadPlan(recoveryLogs,
+ Lists.newArrayList(usefulGroups.values()));
+
+ }
+
+ @Override
+ public boolean needToSave() {
+ return needToSave;
+ }
+
+
+ RemoteEditLogManifest getEditLogManifest(long sinceTxId) {
+ List<RemoteEditLog> logs = Lists.newArrayList();
+ for (LogGroup g : logGroups.values()) {
+ if (!g.hasFinalized) continue;
+
+ FoundEditLog fel = g.getBestNonCorruptLog();
+ if (fel.getLastTxId() < sinceTxId) continue;
+
+ logs.add(new RemoteEditLog(fel.getStartTxId(),
+ fel.getLastTxId()));
+ }
+
+ return new RemoteEditLogManifest(logs);
+ }
+
+ /**
+ * A group of logs that all start at the same txid.
+ *
+ * Handles determining which logs are corrupt and which should be considered
+ * candidates for loading.
+ */
+ static class LogGroup {
+ long startTxId;
+ List<FoundEditLog> logs = new ArrayList<FoundEditLog>();
+ private Set<Long> endTxIds = new TreeSet<Long>();
+ private boolean hasInProgress = false;
+ private boolean hasFinalized = false;
+
+ LogGroup(long startTxId) {
+ this.startTxId = startTxId;
+ }
+
+ FoundEditLog getBestNonCorruptLog() {
+ // First look for non-corrupt finalized logs
+ for (FoundEditLog log : logs) {
+ if (!log.isCorrupt() && !log.isInProgress()) {
+ return log;
+ }
+ }
+ // Then look for non-corrupt in-progress logs
+ for (FoundEditLog log : logs) {
+ if (!log.isCorrupt()) {
+ return log;
+ }
+ }
+
+ // We should never get here, because we don't get to the planning stage
+ // without calling planRecovery first, and if we've called planRecovery,
+ // we would have already thrown if there were no non-corrupt logs!
+ throw new IllegalStateException(
+ "No non-corrupt logs for txid " + startTxId);
+ }
+
+ /**
+ * @return true if we can determine the last txid in this log group.
+ */
+ boolean hasKnownLastTxId() {
+ for (FoundEditLog log : logs) {
+ if (!log.isInProgress()) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * @return the last txid included in the logs in this group
+ * @throws IllegalStateException if it is unknown -
+ * {@link #hasKnownLastTxId()}
+ */
+ long getLastTxId() {
+ for (FoundEditLog log : logs) {
+ if (!log.isInProgress()) {
+ return log.lastTxId;
+ }
+ }
+ throw new IllegalStateException("LogGroup only has in-progress logs");
+ }
+
+
+ void add(FoundEditLog log) {
+ assert log.getStartTxId() == startTxId;
+ logs.add(log);
+
+ if (log.isInProgress()) {
+ hasInProgress = true;
+ } else {
+ hasFinalized = true;
+ endTxIds.add(log.lastTxId);
+ }
+ }
+
+ void planRecovery() throws IOException {
+ assert hasInProgress || hasFinalized;
+
+ checkConsistentEndTxIds();
+
+ if (hasFinalized && hasInProgress) {
+ planMixedLogRecovery();
+ } else if (!hasFinalized && hasInProgress) {
+ planAllInProgressRecovery();
+ } else if (hasFinalized && !hasInProgress) {
+ LOG.debug("No recovery necessary for logs starting at txid " +
+ startTxId);
+ }
+ }
+
+ /**
+ * Recovery case for when some logs in the group were in-progress, and
+ * others were finalized. This happens when one of the storage
+ * directories fails.
+ *
+ * The in-progress logs in this case should be considered corrupt.
+ */
+ private void planMixedLogRecovery() throws IOException {
+ for (FoundEditLog log : logs) {
+ if (log.isInProgress()) {
+ LOG.warn("Log at " + log.getFile() + " is in progress, but " +
+ "other logs starting at the same txid " + startTxId +
+ " are finalized. Moving aside.");
+ log.markCorrupt();
+ }
+ }
+ }
+
+ /**
+ * Recovery case for when all of the logs in the group were in progress.
+ * This happens if the NN completely crashes and restarts. In this case
+ * we check the non-zero lengths of each log file, and any logs that are
+ * less than the max of these lengths are considered corrupt.
+ */
+ private void planAllInProgressRecovery() throws IOException {
+ // We only have in-progress logs. We need to figure out which logs have
+ // the latest data to recover them
+ LOG.warn("Logs beginning at txid " + startTxId + " are all " +
+ "in-progress (probably truncated due to a previous NameNode " +
+ "crash)");
+ if (logs.size() == 1) {
+ // Only one log, it's our only choice!
+ FoundEditLog log = logs.get(0);
+ if (log.validateLog().numTransactions == 0) {
+ // If it has no transactions, we should consider it corrupt just
+ // to be conservative.
+ // See comment below for similar case
+ LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+ "it has no transactions in it.");
+ log.markCorrupt();
+ }
+ return;
+ }
+
+ long maxValidTxnCount = Long.MIN_VALUE;
+ for (FoundEditLog log : logs) {
+ long validTxnCount = log.validateLog().numTransactions;
+ LOG.warn(" Log " + log.getFile() +
+ " valid txns=" + validTxnCount +
+ " valid len=" + log.validateLog().validLength);
+ maxValidTxnCount = Math.max(maxValidTxnCount, validTxnCount);
+ }
+
+ for (FoundEditLog log : logs) {
+ long txns = log.validateLog().numTransactions;
+ if (txns < maxValidTxnCount) {
+ LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+ "it is has only " + txns + " valid txns whereas another " +
+ "log has " + maxValidTxnCount);
+ log.markCorrupt();
+ } else if (txns == 0) {
+ // this can happen if the NN crashes right after rolling a log
+ // but before the START_LOG_SEGMENT txn is written. Since the log
+ // is empty, we can just move it aside to its corrupt name.
+ LOG.warn("Marking log at " + log.getFile() + " as corrupt since " +
+ "it has no transactions in it.");
+ log.markCorrupt();
+ }
+ }
+ }
+
+ /**
+ * Check for the case when we have multiple finalized logs and they have
+ * different ending transaction IDs. This violates an invariant that all
+ * log directories should roll together. We should abort in this case.
+ */
+ private void checkConsistentEndTxIds() throws IOException {
+ if (hasFinalized && endTxIds.size() > 1) {
+ throw new IOException("More than one ending txid was found " +
+ "for logs starting at txid " + startTxId + ". " +
+ "Found: " + StringUtils.join(endTxIds, ','));
+ }
+ }
+
+ void recover() throws IOException {
+ for (FoundEditLog log : logs) {
+ if (log.isCorrupt()) {
+ log.moveAsideCorruptFile();
+ } else if (log.isInProgress()) {
+ log.finalizeLog();
+ }
+ }
+ }
+ }
+
+ /**
+ * Record of an image that has been located and had its filename parsed.
+ */
+ static class FoundFSImage {
+ final StorageDirectory sd;
+ final long txId;
+ private final File file;
+
+ FoundFSImage(StorageDirectory sd, File file, long txId) {
+ assert txId >= 0 : "Invalid txid on " + file + ": " + txId;
+
+ this.sd = sd;
+ this.txId = txId;
+ this.file = file;
+ }
+
+ File getFile() {
+ return file;
+ }
+
+ public long getTxId() {
+ return txId;
+ }
+
+ @Override
+ public String toString() {
+ return file.toString();
+ }
+ }
+
+ /**
+ * Record of an edit log that has been located and had its filename parsed.
+ */
+ static class FoundEditLog {
+ File file;
+ final long startTxId;
+ long lastTxId;
+
+ private EditLogValidation cachedValidation = null;
+ private boolean isCorrupt = false;
+
+ static final long UNKNOWN_END = -1;
+
+ FoundEditLog(File file,
+ long startTxId, long endTxId) {
+ assert endTxId == UNKNOWN_END || endTxId >= startTxId;
+ assert startTxId > 0;
+ assert file != null;
+
+ this.startTxId = startTxId;
+ this.lastTxId = endTxId;
+ this.file = file;
+ }
+
+ public void finalizeLog() throws IOException {
+ long numTransactions = validateLog().numTransactions;
+ long lastTxId = startTxId + numTransactions - 1;
+ File dst = new File(file.getParentFile(),
+ NNStorage.getFinalizedEditsFileName(startTxId, lastTxId));
+ LOG.info("Finalizing edits log " + file + " by renaming to "
+ + dst.getName());
+ if (!file.renameTo(dst)) {
+ throw new IOException("Couldn't finalize log " +
+ file + " to " + dst);
+ }
+ this.lastTxId = lastTxId;
+ file = dst;
+ }
+
+ long getStartTxId() {
+ return startTxId;
+ }
+
+ long getLastTxId() {
+ return lastTxId;
+ }
+
+ EditLogValidation validateLog() throws IOException {
+ if (cachedValidation == null) {
+ cachedValidation = FSEditLogLoader.validateEditLog(file);
+ }
+ return cachedValidation;
+ }
+
+ boolean isInProgress() {
+ return (lastTxId == UNKNOWN_END);
+ }
+
+ File getFile() {
+ return file;
+ }
+
+ void markCorrupt() {
+ isCorrupt = true;
+ }
+
+ boolean isCorrupt() {
+ return isCorrupt;
+ }
+
+ void moveAsideCorruptFile() throws IOException {
+ assert isCorrupt;
+
+ File src = file;
+ File dst = new File(src.getParent(), src.getName() + ".corrupt");
+ boolean success = src.renameTo(dst);
+ if (!success) {
+ throw new IOException(
+ "Couldn't rename corrupt log " + src + " to " + dst);
+ }
+ file = dst;
+ }
+
+ @Override
+ public String toString() {
+ return file.toString();
+ }
+ }
+
+ static class TransactionalLoadPlan extends LoadPlan {
+ final FoundFSImage image;
+ final LogLoadPlan logPlan;
+
+ public TransactionalLoadPlan(FoundFSImage image,
+ LogLoadPlan logPlan) {
+ super();
+ this.image = image;
+ this.logPlan = logPlan;
+ }
+
+ @Override
+ boolean doRecovery() throws IOException {
+ logPlan.doRecovery();
+ return false;
+ }
+
+ @Override
+ File getImageFile() {
+ return image.getFile();
+ }
+
+ @Override
+ List getEditsFiles() {
+ return logPlan.getEditsFiles();
+ }
+
+ @Override
+ StorageDirectory getStorageDirectoryForProperties() {
+ return image.sd;
+ }
+ }
+
+ static class LogLoadPlan {
+ final List<File> editLogs;
+ final List<LogGroup> logGroupsToRecover;
+
+ LogLoadPlan(List<File> editLogs,
+ List<LogGroup> logGroupsToRecover) {
+ this.editLogs = editLogs;
+ this.logGroupsToRecover = logGroupsToRecover;
+ }
+
+ public void doRecovery() throws IOException {
+ for (LogGroup g : logGroupsToRecover) {
+ g.recover();
+ }
+ }
+
+ public List<File> getEditsFiles() {
+ List<File> ret = new ArrayList<File>();
+ for (FoundEditLog log : editLogs) {
+ ret.add(log.getFile());
+ }
+ return ret;
+ }
+ }
+}
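
The filename parsing above is plain string work and can be exercised in isolation.
A sketch using the same regular expressions against hypothetical file names:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    class EditsNameParser {
      private static final Pattern EDITS =
          Pattern.compile("edits_(\\d+)-(\\d+)");
      private static final Pattern INPROGRESS =
          Pattern.compile("edits_inprogress_(\\d+)");

      public static void main(String[] args) {
        for (String name : new String[] {
            "edits_0000000000000000001-0000000000000000100",
            "edits_inprogress_0000000000000000101"}) {
          Matcher m = EDITS.matcher(name);
          if (m.matches()) {
            // Finalized segment: both start and end txids are known.
            System.out.println("finalized " + Long.valueOf(m.group(1))
                + " -> " + Long.valueOf(m.group(2)));
          } else if ((m = INPROGRESS.matcher(name)).matches()) {
            // In-progress segment: end txid is unknown until finalized.
            System.out.println("in-progress from " + Long.valueOf(m.group(1)));
          }
        }
      }
    }
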
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index 3b585732881..b3332ce468c 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -105,6 +105,7 @@
import org.apache.hadoop.hdfs.server.blockmanagement.UnderReplicatedBlocks;
import org.apache.hadoop.hdfs.server.common.GenerationStamp;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.BlockUCState;
+import org.apache.hadoop.hdfs.server.common.HdfsConstants.NamenodeRole;
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.common.UpgradeStatusReport;
@@ -124,6 +125,7 @@
import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
@@ -323,8 +325,7 @@ private void initialize(Configuration conf, FSImage fsImage)
if(fsImage == null) {
this.dir = new FSDirectory(this, conf);
StartupOption startOpt = NameNode.getStartupOption(conf);
- this.dir.loadFSImage(getNamespaceDirs(conf),
- getNamespaceEditsDirs(conf), startOpt);
+ this.dir.loadFSImage(startOpt);
long timeTakenToLoadFSImage = now() - systemStart;
LOG.info("Finished loading FSImage in " + timeTakenToLoadFSImage + " msecs");
NameNode.getNameNodeMetrics().setFsImageLoadTime(
@@ -392,8 +393,9 @@ public static Collection<URI> getStorageDirs(Configuration conf,
+ propertyName + "\" in hdfs-site.xml;" +
"\n\t\t- use Backup Node as a persistent and up-to-date storage " +
"of the file system meta-data.");
- } else if (dirNames.isEmpty())
- dirNames.add("file:///tmp/hadoop/dfs/name");
+ } else if (dirNames.isEmpty()) {
+ dirNames = Collections.singletonList("file:///tmp/hadoop/dfs/name");
+ }
return Util.stringCollectionAsURIs(dirNames);
}
@@ -3258,7 +3260,7 @@ void saveNamespace() throws AccessControlException, IOException {
throw new IOException("Safe mode should be turned ON " +
"in order to create namespace image.");
}
- getFSImage().saveNamespace(true);
+ getFSImage().saveNamespace();
LOG.info("New namespace image has been created.");
} finally {
readUnlock();
@@ -4003,8 +4005,8 @@ String getSafeModeTip() {
}
}
- long getEditLogSize() throws IOException {
- return getEditLog().getEditLogSize();
+ public long getTransactionID() {
+ return getEditLog().getSyncTxId();
}
CheckpointSignature rollEditLog() throws IOException {
@@ -4019,24 +4021,9 @@ CheckpointSignature rollEditLog() throws IOException {
writeUnlock();
}
}
-
- /**
- * Moves fsimage.ckpt to fsImage and edits.new to edits
- * Reopens the new edits file.
- *
- * @param sig the signature of this checkpoint (old image)
- */
- void rollFSImage(CheckpointSignature sig) throws IOException {
- writeLock();
- try {
- if (isInSafeMode()) {
- throw new SafeModeException("Image not rolled", safeMode);
- }
- LOG.info("Roll FSImage from " + Server.getRemoteAddress());
- getFSImage().rollFSImage(sig, true);
- } finally {
- writeUnlock();
- }
+
+ public RemoteEditLogManifest getEditLogManifest(long sinceTxId) throws IOException {
+ return getEditLog().getEditLogManifest(sinceTxId);
}
NamenodeCommand startCheckpoint(
@@ -4516,31 +4503,29 @@ void saveFilesUnderConstruction(DataOutputStream out) throws IOException {
}
/**
- * Register a name-node.
- *
- * Registration is allowed if there is no ongoing streaming to
- * another backup node.
- * We currently allow only one backup node, but multiple checkpointers
- * if there are no backups.
+ * Register a Backup name-node, verifying that it belongs
+ * to the correct namespace, and adding it to the set of
+ * active journals if necessary.
*
- * @param registration
- * @throws IOException
+ * @param bnReg registration of the new BackupNode
+ * @param nnReg registration of this NameNode
+ * @throws IOException if the namespace IDs do not match
*/
- void registerBackupNode(NamenodeRegistration registration)
- throws IOException {
+ void registerBackupNode(NamenodeRegistration bnReg,
+ NamenodeRegistration nnReg) throws IOException {
writeLock();
try {
if(getFSImage().getStorage().getNamespaceID()
- != registration.getNamespaceID())
+ != bnReg.getNamespaceID())
throw new IOException("Incompatible namespaceIDs: "
+ " Namenode namespaceID = "
+ getFSImage().getStorage().getNamespaceID() + "; "
- + registration.getRole() +
- " node namespaceID = " + registration.getNamespaceID());
- boolean regAllowed = getEditLog().checkBackupRegistration(registration);
- if(!regAllowed)
- throw new IOException("Registration is not allowed. " +
- "Another node is registered as a backup.");
+ + bnReg.getRole() +
+ " node namespaceID = " + bnReg.getNamespaceID());
+ if (bnReg.getRole() == NamenodeRole.BACKUP) {
+ getFSImage().getEditLog().registerBackupNode(
+ bnReg, nnReg);
+ }
} finally {
writeUnlock();
}
@@ -5080,4 +5065,6 @@ public BlockManager getBlockManager() {
void removeDecomNodeFromList(List<DatanodeDescriptor> nodeList) {
getBlockManager().getDatanodeManager().removeDecomNodeFromList(nodeList);
}
+
+
}
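
getEditLogManifest() tells a checkpointer which finalized segments it still lacks.
A sketch of the consumer-side filtering, using a simplified stand-in for
RemoteEditLog:

    import java.util.ArrayList;
    import java.util.List;

    class ManifestConsumerSketch {
      // Simplified stand-in for RemoteEditLog: just a txid range.
      static class Segment {
        final long start, end;
        Segment(long start, long end) { this.start = start; this.end = end; }
      }

      // Pick the segments a checkpointer must fetch, given the last txid
      // already covered by its local image.
      static List<Segment> segmentsToFetch(List<Segment> manifest,
                                           long lastImageTxId) {
        List<Segment> needed = new ArrayList<Segment>();
        for (Segment s : manifest) {
          if (s.end > lastImageTxId) {  // skip fully-covered segments
            needed.add(s);
          }
        }
        return needed;
      }
    }
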
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java
new file mode 100644
index 00000000000..360a118fa27
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/FileJournalManager.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.FoundEditLog;
+import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger;
+
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
+
+/**
+ * Journal manager for the common case of edits files being written
+ * to a storage directory.
+ *
+ * Note: this class is not thread-safe and should be externally
+ * synchronized.
+ */
+class FileJournalManager implements JournalManager {
+ private static final Log LOG = LogFactory.getLog(FileJournalManager.class);
+
+ private final StorageDirectory sd;
+ private int outputBufferCapacity = 512*1024;
+
+ public FileJournalManager(StorageDirectory sd) {
+ this.sd = sd;
+ }
+
+ @Override
+ public EditLogOutputStream startLogSegment(long txid) throws IOException {
+ File newInProgress = NNStorage.getInProgressEditsFile(sd, txid);
+ EditLogOutputStream stm = new EditLogFileOutputStream(newInProgress,
+ outputBufferCapacity);
+ stm.create();
+ return stm;
+ }
+
+ @Override
+ public void finalizeLogSegment(long firstTxId, long lastTxId)
+ throws IOException {
+ File inprogressFile = NNStorage.getInProgressEditsFile(
+ sd, firstTxId);
+ File dstFile = NNStorage.getFinalizedEditsFile(
+ sd, firstTxId, lastTxId);
+ LOG.debug("Finalizing edits file " + inprogressFile + " -> " + dstFile);
+
+ Preconditions.checkState(!dstFile.exists(),
+ "Can't finalize edits file " + inprogressFile + " since finalized file " +
+ "already exists");
+ if (!inprogressFile.renameTo(dstFile)) {
+ throw new IOException("Unable to finalize edits file " + inprogressFile);
+ }
+ }
+
+ @VisibleForTesting
+ public StorageDirectory getStorageDirectory() {
+ return sd;
+ }
+
+ @Override
+ public String toString() {
+ return "FileJournalManager for storage directory " + sd;
+ }
+
+ @Override
+ public void setOutputBufferCapacity(int size) {
+ this.outputBufferCapacity = size;
+ }
+
+ @Override
+ public void purgeLogsOlderThan(long minTxIdToKeep, StoragePurger purger)
+ throws IOException {
+ File[] files = FileUtil.listFiles(sd.getCurrentDir());
+    List<FoundEditLog> editLogs =
+ FSImageTransactionalStorageInspector.matchEditLogs(files);
+ for (FoundEditLog log : editLogs) {
+ if (log.getStartTxId() < minTxIdToKeep &&
+ log.getLastTxId() < minTxIdToKeep) {
+ purger.purgeLog(log);
+ }
+ }
+ }
+
+ @Override
+ public EditLogInputStream getInProgressInputStream(long segmentStartsAtTxId)
+ throws IOException {
+ File f = NNStorage.getInProgressEditsFile(sd, segmentStartsAtTxId);
+ return new EditLogFileInputStream(f);
+ }
+
+}
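
FileJournalManager's write path is a create-then-rename protocol: a segment lives in edits_inprogress_<startTxId> while open and becomes edits_<startTxId>-<endTxId> when finalized, so a crash can never leave a finalized-looking file with missing transactions. A rough standalone sketch of that lifecycle, assuming the naming scheme above; SimpleJournal is illustrative, not the patch's class:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

// Simplified model of the in-progress -> finalized rename dance.
class SimpleJournal {
  private final File dir;

  SimpleJournal(File dir) { this.dir = dir; }

  // Start a segment: open edits_inprogress_<startTxId> for writing.
  OutputStream startLogSegment(long startTxId) throws IOException {
    return new FileOutputStream(inProgressFile(startTxId));
  }

  // Finalize: rename edits_inprogress_<start> to edits_<start>-<end>.
  void finalizeLogSegment(long startTxId, long endTxId) throws IOException {
    File src = inProgressFile(startTxId);
    File dst = new File(dir,
        String.format("edits_%019d-%019d", startTxId, endTxId));
    if (dst.exists()) {
      throw new IOException("Finalized segment already exists: " + dst);
    }
    if (!src.renameTo(dst)) {
      throw new IOException("Unable to finalize " + src);
    }
  }

  private File inProgressFile(long startTxId) {
    return new File(dir, String.format("edits_inprogress_%019d", startTxId));
  }
}
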
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java
index 04bfe845a43..8753b270f1d 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/GetImageServlet.java
@@ -20,6 +20,8 @@
import java.security.PrivilegedExceptionAction;
import java.util.*;
import java.io.*;
+import java.net.InetSocketAddress;
+
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
@@ -34,11 +36,16 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.server.common.JspHelper;
+import org.apache.hadoop.hdfs.server.common.StorageInfo;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
+import org.apache.hadoop.hdfs.util.MD5FileUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
+import com.google.common.base.Preconditions;
+
/**
* This class is used in Namesystem's jetty to retrieve a file.
* Typically used by the Secondary NameNode to retrieve image and
@@ -50,15 +57,21 @@ public class GetImageServlet extends HttpServlet {
private static final Log LOG = LogFactory.getLog(GetImageServlet.class);
- @SuppressWarnings("unchecked")
+ private static final String TXID_PARAM = "txid";
+ private static final String START_TXID_PARAM = "startTxId";
+ private static final String END_TXID_PARAM = "endTxId";
+ private static final String STORAGEINFO_PARAM = "storageInfo";
+
+  private static Set<Long> currentlyDownloadingCheckpoints =
+     Collections.synchronizedSet(new HashSet<Long>());
+
public void doGet(final HttpServletRequest request,
final HttpServletResponse response
) throws ServletException, IOException {
-    Map<String,String[]> pmap = request.getParameterMap();
try {
ServletContext context = getServletContext();
final FSImage nnImage = NameNodeHttpServer.getFsImageFromContext(context);
- final TransferFsImage ff = new TransferFsImage(pmap, request, response);
+ final GetImageParams parsedParams = new GetImageParams(request, response);
final Configuration conf =
(Configuration)getServletContext().getAttribute(JspHelper.CURRENT_CONF);
@@ -70,45 +83,77 @@ public void doGet(final HttpServletRequest request,
+ request.getRemoteHost());
return;
}
-
+
+ String myStorageInfoString = nnImage.getStorage().toColonSeparatedString();
+ String theirStorageInfoString = parsedParams.getStorageInfoString();
+ if (theirStorageInfoString != null &&
+ !myStorageInfoString.equals(theirStorageInfoString)) {
+ response.sendError(HttpServletResponse.SC_FORBIDDEN,
+ "This namenode has storage info " + myStorageInfoString +
+ " but the secondary expected " + theirStorageInfoString);
+ LOG.warn("Received an invalid request file transfer request " +
+ "from a secondary with storage info " + theirStorageInfoString);
+ return;
+ }
+
      UserGroupInformation.getCurrentUser().doAs(new PrivilegedExceptionAction<Void>() {
@Override
public Void run() throws Exception {
- if (ff.getImage()) {
- response.setHeader(TransferFsImage.CONTENT_LENGTH,
- String.valueOf(nnImage.getStorage()
- .getFsImageName().length()));
- // send fsImage
- TransferFsImage.getFileServer(response.getOutputStream(),
- nnImage.getStorage().getFsImageName(),
- getThrottler(conf));
- } else if (ff.getEdit()) {
- response.setHeader(TransferFsImage.CONTENT_LENGTH,
- String.valueOf(nnImage.getStorage()
- .getFsEditName().length()));
- // send edits
- TransferFsImage.getFileServer(response.getOutputStream(),
- nnImage.getStorage().getFsEditName(),
- getThrottler(conf));
- } else if (ff.putImage()) {
- // issue a HTTP get request to download the new fsimage
- nnImage.validateCheckpointUpload(ff.getToken());
- nnImage.newImageDigest = ff.getNewChecksum();
- MD5Hash downloadImageDigest = reloginIfNecessary().doAs(
-              new PrivilegedExceptionAction<MD5Hash>() {
- @Override
- public MD5Hash run() throws Exception {
- return TransferFsImage.getFileClient(
- ff.getInfoServer(), "getimage=1",
- nnImage.getStorage().getFsImageNameCheckpoint(), true);
- }
- });
- if (!nnImage.newImageDigest.equals(downloadImageDigest)) {
- throw new IOException("The downloaded image is corrupt," +
- " expecting a checksum " + nnImage.newImageDigest +
- " but received a checksum " + downloadImageDigest);
+ if (parsedParams.isGetImage()) {
+ long txid = parsedParams.getTxId();
+ File imageFile = nnImage.getStorage().getFsImageName(txid);
+ if (imageFile == null) {
+ throw new IOException("Could not find image with txid " + txid);
+ }
+ setVerificationHeaders(response, imageFile);
+ // send fsImage
+ TransferFsImage.getFileServer(response.getOutputStream(), imageFile,
+ getThrottler(conf));
+ } else if (parsedParams.isGetEdit()) {
+ long startTxId = parsedParams.getStartTxId();
+ long endTxId = parsedParams.getEndTxId();
+
+ File editFile = nnImage.getStorage()
+ .findFinalizedEditsFile(startTxId, endTxId);
+ setVerificationHeaders(response, editFile);
+
+ // send edits
+ TransferFsImage.getFileServer(response.getOutputStream(), editFile,
+ getThrottler(conf));
+ } else if (parsedParams.isPutImage()) {
+ final long txid = parsedParams.getTxId();
+
+ if (! currentlyDownloadingCheckpoints.add(txid)) {
+ throw new IOException(
+ "Another checkpointer is already in the process of uploading a" +
+ " checkpoint made at transaction ID " + txid);
+ }
+
+ try {
+ if (nnImage.getStorage().findImageFile(txid) != null) {
+ throw new IOException(
+ "Another checkpointer already uploaded an checkpoint " +
+ "for txid " + txid);
+ }
+
+ // issue a HTTP get request to download the new fsimage
+ MD5Hash downloadImageDigest = reloginIfNecessary().doAs(
+                new PrivilegedExceptionAction<MD5Hash>() {
+ @Override
+ public MD5Hash run() throws Exception {
+ return TransferFsImage.downloadImageToStorage(
+ parsedParams.getInfoServer(), txid,
+ nnImage.getStorage(), true);
+ }
+ });
+ nnImage.saveDigestAndRenameCheckpointImage(txid, downloadImageDigest);
+
+ // Now that we have a new checkpoint, we might be able to
+ // remove some old ones.
+ nnImage.purgeOldStorage();
+ } finally {
+ currentlyDownloadingCheckpoints.remove(txid);
}
- nnImage.checkpointUploadDone();
}
return null;
}
@@ -182,4 +227,148 @@ protected boolean isValidRequestor(String remoteUser, Configuration conf)
if(LOG.isDebugEnabled()) LOG.debug("isValidRequestor is rejecting: " + remoteUser);
return false;
}
+
+ /**
+ * Set headers for content length, and, if available, md5.
+ * @throws IOException
+ */
+ private void setVerificationHeaders(HttpServletResponse response, File file)
+ throws IOException {
+ response.setHeader(TransferFsImage.CONTENT_LENGTH,
+ String.valueOf(file.length()));
+ MD5Hash hash = MD5FileUtils.readStoredMd5ForFile(file);
+ if (hash != null) {
+ response.setHeader(TransferFsImage.MD5_HEADER, hash.toString());
+ }
+ }
+
+ static String getParamStringForImage(long txid,
+ StorageInfo remoteStorageInfo) {
+ return "getimage=1&" + TXID_PARAM + "=" + txid
+ + "&" + STORAGEINFO_PARAM + "=" +
+ remoteStorageInfo.toColonSeparatedString();
+
+ }
+
+ static String getParamStringForLog(RemoteEditLog log,
+ StorageInfo remoteStorageInfo) {
+ return "getedit=1&" + START_TXID_PARAM + "=" + log.getStartTxId()
+ + "&" + END_TXID_PARAM + "=" + log.getEndTxId()
+ + "&" + STORAGEINFO_PARAM + "=" +
+ remoteStorageInfo.toColonSeparatedString();
+ }
+
+ static String getParamStringToPutImage(long txid,
+ InetSocketAddress imageListenAddress, NNStorage storage) {
+
+ return "putimage=1" +
+ "&" + TXID_PARAM + "=" + txid +
+ "&port=" + imageListenAddress.getPort() +
+ "&machine=" + imageListenAddress.getHostName()
+ + "&" + STORAGEINFO_PARAM + "=" +
+ storage.toColonSeparatedString();
+ }
+
+
+ static class GetImageParams {
+ private boolean isGetImage;
+ private boolean isGetEdit;
+ private boolean isPutImage;
+ private int remoteport;
+ private String machineName;
+ private long startTxId, endTxId, txId;
+ private String storageInfoString;
+
+ /**
+ * @param request the object from which this servlet reads the url contents
+ * @param response the object into which this servlet writes the url contents
+ * @throws IOException if the request is bad
+ */
+ public GetImageParams(HttpServletRequest request,
+ HttpServletResponse response
+ ) throws IOException {
+ @SuppressWarnings("unchecked")
+      Map<String, String[]> pmap = request.getParameterMap();
+ isGetImage = isGetEdit = isPutImage = false;
+ remoteport = 0;
+ machineName = null;
+
+      for (Map.Entry<String, String[]> entry : pmap.entrySet()) {
+ String key = entry.getKey();
+ String[] val = entry.getValue();
+ if (key.equals("getimage")) {
+ isGetImage = true;
+ txId = parseLongParam(request, TXID_PARAM);
+ } else if (key.equals("getedit")) {
+ isGetEdit = true;
+ startTxId = parseLongParam(request, START_TXID_PARAM);
+ endTxId = parseLongParam(request, END_TXID_PARAM);
+ } else if (key.equals("putimage")) {
+ isPutImage = true;
+ txId = parseLongParam(request, TXID_PARAM);
+ } else if (key.equals("port")) {
+ remoteport = new Integer(val[0]).intValue();
+ } else if (key.equals("machine")) {
+ machineName = val[0];
+ } else if (key.equals(STORAGEINFO_PARAM)) {
+ storageInfoString = val[0];
+ }
+ }
+
+ int numGets = (isGetImage?1:0) + (isGetEdit?1:0);
+ if ((numGets > 1) || (numGets == 0) && !isPutImage) {
+ throw new IOException("Illegal parameters to TransferFsImage");
+ }
+ }
+
+ public String getStorageInfoString() {
+ return storageInfoString;
+ }
+
+ public long getTxId() {
+ Preconditions.checkState(isGetImage || isPutImage);
+ return txId;
+ }
+
+ public long getStartTxId() {
+ Preconditions.checkState(isGetEdit);
+ return startTxId;
+ }
+
+ public long getEndTxId() {
+ Preconditions.checkState(isGetEdit);
+ return endTxId;
+ }
+
+ boolean isGetEdit() {
+ return isGetEdit;
+ }
+
+ boolean isGetImage() {
+ return isGetImage;
+ }
+
+ boolean isPutImage() {
+ return isPutImage;
+ }
+
+ String getInfoServer() throws IOException{
+ if (machineName == null || remoteport == 0) {
+ throw new IOException ("MachineName and port undefined");
+ }
+ return machineName + ":" + remoteport;
+ }
+
+ private static long parseLongParam(HttpServletRequest request, String param)
+ throws IOException {
+      // Parse a numeric transaction-id parameter from the request.
+ String paramStr = request.getParameter(param);
+ if (paramStr == null) {
+ throw new IOException("Invalid request has no " + param + " parameter");
+ }
+
+ return Long.valueOf(paramStr);
+ }
+ }
}
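
The servlet's wire protocol is plain HTTP query strings built from the constants above (getimage, getedit, putimage, txid, startTxId, endTxId, port, machine, storageInfo). A rough standalone sketch of the builder side; ImageTransferParams is an illustrative name, not a class in the patch:

// Illustrative builders mirroring getParamStringForImage/ForLog/ToPutImage.
class ImageTransferParams {
  static String forImage(long txid, String storageInfo) {
    return "getimage=1&txid=" + txid + "&storageInfo=" + storageInfo;
  }

  static String forEdits(long startTxId, long endTxId, String storageInfo) {
    return "getedit=1&startTxId=" + startTxId + "&endTxId=" + endTxId
        + "&storageInfo=" + storageInfo;
  }

  static String forPutImage(long txid, String host, int port,
      String storageInfo) {
    return "putimage=1&txid=" + txid + "&port=" + port
        + "&machine=" + host + "&storageInfo=" + storageInfo;
  }
}

The storageInfo parameter is what lets the NameNode reject transfers from a secondary that was formatted against a different namespace, as the doGet check above shows.
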
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java
new file mode 100644
index 00000000000..56ea5c25129
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/JournalManager.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.IOException;
+
+import org.apache.hadoop.hdfs.server.namenode.NNStorageRetentionManager.StoragePurger;
+
+/**
+ * A JournalManager is responsible for managing a single place of storing
+ * edit logs. It may correspond to multiple files, a backup node, etc.
+ * Even when the actual underlying storage is rolled, or failed and restored,
+ * each conceptual place of storage corresponds to exactly one instance of
+ * this class, which is created when the EditLog is first opened.
+ */
+interface JournalManager {
+ /**
+ * Begin writing to a new segment of the log stream, which starts at
+ * the given transaction ID.
+ */
+ EditLogOutputStream startLogSegment(long txId) throws IOException;
+
+ /**
+ * Mark the log segment that spans from firstTxId to lastTxId
+ * as finalized and complete.
+ */
+ void finalizeLogSegment(long firstTxId, long lastTxId) throws IOException;
+
+ /**
+ * Set the amount of memory that this stream should use to buffer edits
+ */
+ void setOutputBufferCapacity(int size);
+
+ /**
+   * The JournalManager may archive/purge any log segments whose
+   * transactions all precede minTxIdToKeep.
+ *
+ * @param minTxIdToKeep the earliest txid that must be retained after purging
+ * old logs
+ * @param purger the purging implementation to use
+ * @throws IOException if purging fails
+ */
+ void purgeLogsOlderThan(long minTxIdToKeep, StoragePurger purger)
+ throws IOException;
+
+ /**
+ * @return an EditLogInputStream that reads from the same log that
+ * the edit log is currently writing. May return null if this journal
+ * manager does not support this operation.
+ */
+ EditLogInputStream getInProgressInputStream(long segmentStartsAtTxId)
+ throws IOException;
+}
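
The purge contract above reduces to a single predicate: a finalized segment [startTxId, lastTxId] may be dropped only when both endpoints fall below the retention floor, which is exactly what FileJournalManager.purgeLogsOlderThan checks. A standalone restatement; PurgePolicy is illustrative:

// Illustrative restatement of the segment-purge predicate.
class PurgePolicy {
  // A finalized segment may be purged only if every transaction in it
  // precedes the retention floor.
  static boolean purgeable(long startTxId, long lastTxId, long minTxIdToKeep) {
    return startTxId < minTxIdToKeep && lastTxId < minTxIdToKeep;
  }
}
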
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
index 29a638567f4..00461e2fb3f 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorage.java
@@ -17,22 +17,20 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
-import static org.apache.hadoop.hdfs.server.common.Util.now;
-
+import java.io.BufferedReader;
import java.io.Closeable;
-import java.io.DataInputStream;
-import java.io.DataOutputStream;
import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
+import java.io.FileReader;
import java.io.IOException;
import java.io.RandomAccessFile;
+import java.io.OutputStream;
import java.net.URI;
import java.net.UnknownHostException;
import java.security.NoSuchAlgorithmException;
import java.security.SecureRandom;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
@@ -51,13 +49,17 @@
import org.apache.hadoop.hdfs.server.common.HdfsConstants.StartupOption;
import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
import org.apache.hadoop.hdfs.server.common.Storage;
-import org.apache.hadoop.hdfs.server.common.StorageInfo;
import org.apache.hadoop.hdfs.server.common.UpgradeManager;
import org.apache.hadoop.hdfs.server.common.Util;
import org.apache.hadoop.hdfs.server.namenode.JournalStream.JournalType;
-import org.apache.hadoop.io.MD5Hash;
+import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
+
+import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.DNS;
+import com.google.common.base.Preconditions;
+import com.google.common.annotations.VisibleForTesting;
+
/**
* NNStorage is responsible for management of the StorageDirectories used by
* the NameNode.
@@ -66,17 +68,19 @@
public class NNStorage extends Storage implements Closeable {
private static final Log LOG = LogFactory.getLog(NNStorage.class.getName());
- static final String MESSAGE_DIGEST_PROPERTY = "imageMD5Digest";
-
+ static final String DEPRECATED_MESSAGE_DIGEST_PROPERTY = "imageMD5Digest";
+
//
// The filenames used for storing the images
//
enum NameNodeFile {
IMAGE ("fsimage"),
- TIME ("fstime"),
+ TIME ("fstime"), // from "old" pre-HDFS-1073 format
+ SEEN_TXID ("seen_txid"),
EDITS ("edits"),
IMAGE_NEW ("fsimage.ckpt"),
- EDITS_NEW ("edits.new");
+ EDITS_NEW ("edits.new"), // from "old" pre-HDFS-1073 format
+ EDITS_INPROGRESS ("edits_inprogress");
private String fileName = null;
private NameNodeFile(String name) { this.fileName = name; }
@@ -106,42 +110,9 @@ public boolean isOfType(StorageDirType type) {
}
}
- /**
- * Interface to be implemented by classes which make use of storage
- * directories. They are notified when a StorageDirectory is causing errors,
- * becoming available or being formatted.
- *
- * This allows the implementors of the interface take their own specific
- * action on the StorageDirectory when this occurs.
- */
- interface NNStorageListener {
- /**
- * An error has occurred with a StorageDirectory.
- * @param sd The storage directory causing the error.
- * @throws IOException
- */
- void errorOccurred(StorageDirectory sd) throws IOException;
-
- /**
- * A storage directory has been formatted.
- * @param sd The storage directory being formatted.
- * @throws IOException
- */
- void formatOccurred(StorageDirectory sd) throws IOException;
-
- /**
- * A storage directory is now available use.
- * @param sd The storage directory which has become available.
- * @throws IOException
- */
- void directoryAvailable(StorageDirectory sd) throws IOException;
- }
-
-  final private List<NNStorageListener> listeners;
private UpgradeManager upgradeManager = null;
- protected MD5Hash imageDigest = null;
protected String blockpoolID = ""; // id of the block pool
-
+
/**
* flag that controls if we try to restore failed storages
*/
@@ -149,7 +120,13 @@ interface NNStorageListener {
private Object restorationLock = new Object();
private boolean disablePreUpgradableLayoutCheck = false;
- private long checkpointTime = -1L; // The age of the image
+
+ /**
+ * TxId of the last transaction that was included in the most
+ * recent fsimage file. This does not include any transactions
+ * that have since been written to the edit log.
+ */
+ protected long mostRecentCheckpointTxId = FSConstants.INVALID_TXID;
/**
* list of failed (and thus removed) storages
@@ -158,27 +135,26 @@ interface NNStorageListener {
    = new CopyOnWriteArrayList<StorageDirectory>();
/**
- * Construct the NNStorage.
- * @param conf Namenode configuration.
+ * Properties from old layout versions that may be needed
+ * during upgrade only.
*/
- public NNStorage(Configuration conf) {
- super(NodeType.NAME_NODE);
-
-    storageDirs = new CopyOnWriteArrayList<StorageDirectory>();
-    this.listeners = new CopyOnWriteArrayList<NNStorageListener>();
- }
+  private HashMap<String, String> deprecatedProperties;
/**
* Construct the NNStorage.
- * @param storageInfo storage information
- * @param bpid block pool Id
+ * @param conf Namenode configuration.
+ * @param imageDirs Directories the image can be stored in.
+ * @param editsDirs Directories the editlog can be stored in.
+ * @throws IOException if any directories are inaccessible.
*/
- public NNStorage(StorageInfo storageInfo, String bpid) {
- super(NodeType.NAME_NODE, storageInfo);
+ public NNStorage(Configuration conf,
+      Collection<URI> imageDirs, Collection<URI> editsDirs)
+ throws IOException {
+ super(NodeType.NAME_NODE);
    storageDirs = new CopyOnWriteArrayList<StorageDirectory>();
- this.listeners = new CopyOnWriteArrayList();
- this.blockpoolID = bpid;
+
+ setStorageDirectories(imageDirs, editsDirs);
}
@Override // Storage
@@ -207,7 +183,6 @@ public boolean isPreUpgradableLayout(StorageDirectory sd) throws IOException {
@Override // Closeable
public void close() throws IOException {
- listeners.clear();
unlockAll();
storageDirs.clear();
}
@@ -232,10 +207,7 @@ boolean getRestoreFailedStorage() {
/**
* See if any of removed storages is "writable" again, and can be returned
- * into service. If saveNamespace is set, then this method is being
- * called from saveNamespace.
- *
- * @param saveNamespace Whether method is being called from saveNamespace()
+ * into service.
*/
void attemptRestoreRemovedStorage() {
// if directory is "alive" - copy the images there...
@@ -253,23 +225,10 @@ void attemptRestoreRemovedStorage() {
LOG.info("currently disabled dir " + root.getAbsolutePath() +
"; type="+sd.getStorageDirType()
+ ";canwrite="+root.canWrite());
- try {
-
- if(root.exists() && root.canWrite()) {
- // when we try to restore we just need to remove all the data
- // without saving current in-memory state (which could've changed).
- sd.clearDirectory();
-
- LOG.info("restoring dir " + sd.getRoot().getAbsolutePath());
- for (NNStorageListener listener : listeners) {
- listener.directoryAvailable(sd);
- }
-
- this.addStorageDir(sd); // restore
- this.removedStorageDirs.remove(sd);
- }
- } catch(IOException e) {
- LOG.warn("failed to restore " + sd.getRoot().getAbsolutePath(), e);
+ if(root.exists() && root.canWrite()) {
+ LOG.info("restoring dir " + sd.getRoot().getAbsolutePath());
+ this.addStorageDir(sd); // restore
+ this.removedStorageDirs.remove(sd);
}
}
}
@@ -283,9 +242,11 @@ List getRemovedStorageDirs() {
}
/**
- * Set the storage directories which will be used. NNStorage.close() should
- * be called before this to ensure any previous storage directories have been
- * freed.
+ * Set the storage directories which will be used. This should only ever be
+ * called from inside NNStorage. However, it needs to remain package private
+   * for testing, because StorageDirectories must be reinitialised after
+   * using Mockito.spy() on this class; Mockito does not work well with
+   * inner classes such as StorageDirectory.
*
* Synchronized due to initialization of storageDirs and removedStorageDirs.
*
@@ -293,6 +254,7 @@ List getRemovedStorageDirs() {
* @param fsEditsDirs Locations to store edit logs.
* @throws IOException
*/
+ @VisibleForTesting
  synchronized void setStorageDirectories(Collection<URI> fsNameDirs,
                                          Collection<URI> fsEditsDirs)
throws IOException {
@@ -411,110 +373,84 @@ Collection getDirectories(NameNodeDirType dirType)
}
return list;
}
-
+
/**
- * Determine the checkpoint time of the specified StorageDirectory
+ * Determine the last transaction ID noted in this storage directory.
+ * This txid is stored in a special seen_txid file since it might not
+ * correspond to the latest image or edit log. For example, an image-only
+   * directory will have this txid incremented when edit logs roll, even
+   * though the edit logs are in a different directory.
*
* @param sd StorageDirectory to check
- * @return If file exists and can be read, last checkpoint time. If not, 0L.
+ * @return If file exists and can be read, last recorded txid. If not, 0L.
* @throws IOException On errors processing file pointed to by sd
*/
- long readCheckpointTime(StorageDirectory sd) throws IOException {
- File timeFile = getStorageFile(sd, NameNodeFile.TIME);
- long timeStamp = 0L;
- if (timeFile.exists() && timeFile.canRead()) {
- DataInputStream in = new DataInputStream(new FileInputStream(timeFile));
+ static long readTransactionIdFile(StorageDirectory sd) throws IOException {
+ File txidFile = getStorageFile(sd, NameNodeFile.SEEN_TXID);
+ long txid = 0L;
+ if (txidFile.exists() && txidFile.canRead()) {
+ BufferedReader br = new BufferedReader(new FileReader(txidFile));
try {
- timeStamp = in.readLong();
+ txid = Long.valueOf(br.readLine());
} finally {
- in.close();
+ IOUtils.cleanup(LOG, br);
}
}
- return timeStamp;
+ return txid;
}
-
+
/**
   * Write the given transaction ID into the seen_txid file of a
   * storage directory.
   *
   * @param sd the storage directory to write to
* @throws IOException
*/
- public void writeCheckpointTime(StorageDirectory sd) throws IOException {
- if (checkpointTime < 0L)
- return; // do not write negative time
- File timeFile = getStorageFile(sd, NameNodeFile.TIME);
- if (timeFile.exists() && ! timeFile.delete()) {
- LOG.error("Cannot delete chekpoint time file: "
- + timeFile.getCanonicalPath());
- }
- FileOutputStream fos = new FileOutputStream(timeFile);
- DataOutputStream out = new DataOutputStream(fos);
+ void writeTransactionIdFile(StorageDirectory sd, long txid) throws IOException {
+ Preconditions.checkArgument(txid >= 0, "bad txid: " + txid);
+
+ File txIdFile = getStorageFile(sd, NameNodeFile.SEEN_TXID);
+ OutputStream fos = new AtomicFileOutputStream(txIdFile);
try {
- out.writeLong(checkpointTime);
- out.flush();
- fos.getChannel().force(true);
+ fos.write(String.valueOf(txid).getBytes());
+ fos.write('\n');
} finally {
- out.close();
+ IOUtils.cleanup(LOG, fos);
}
}
/**
- * Record new checkpoint time in order to
- * distinguish healthy directories from the removed ones.
- * If there is an error writing new checkpoint time, the corresponding
- * storage directory is removed from the list.
+ * Set the transaction ID of the last checkpoint
*/
- public void incrementCheckpointTime() {
- setCheckpointTimeInStorage(checkpointTime + 1);
+ void setMostRecentCheckpointTxId(long txid) {
+ this.mostRecentCheckpointTxId = txid;
}
/**
- * The age of the namespace state.
- * Reflects the latest time the image was saved.
- * Modified with every save or a checkpoint.
- * Persisted in VERSION file.
- *
- * @return the current checkpoint time.
+ * Return the transaction ID of the last checkpoint.
*/
- public long getCheckpointTime() {
- return checkpointTime;
+ long getMostRecentCheckpointTxId() {
+ return mostRecentCheckpointTxId;
}
/**
- * Set the checkpoint time.
- *
- * This method does not persist the checkpoint time to storage immediately.
+ * Write a small file in all available storage directories that
+ * indicates that the namespace has reached some given transaction ID.
*
- * @see #setCheckpointTimeInStorage
- * @param newCpT the new checkpoint time.
+ * This is used when the image is loaded to avoid accidental rollbacks
+ * in the case where an edit log is fully deleted but there is no
+ * checkpoint. See TestNameEditsConfigs.testNameEditsConfigsFailure()
+ * @param txid the txid that has been reached
*/
- public void setCheckpointTime(long newCpT) {
- checkpointTime = newCpT;
- }
-
- /**
- * Set the current checkpoint time. Writes the new checkpoint
- * time to all available storage directories.
- * @param newCpT The new checkpoint time.
- */
- public void setCheckpointTimeInStorage(long newCpT) {
- checkpointTime = newCpT;
- // Write new checkpoint time in all storage directories
-    for(Iterator<StorageDirectory> it =
- dirIterator(); it.hasNext();) {
- StorageDirectory sd = it.next();
+ public void writeTransactionIdFileToStorage(long txid) {
+ // Write txid marker in all storage directories
+ for (StorageDirectory sd : storageDirs) {
try {
- writeCheckpointTime(sd);
+ writeTransactionIdFile(sd, txid);
} catch(IOException e) {
// Close any edits stream associated with this dir and remove directory
- LOG.warn("incrementCheckpointTime failed on "
- + sd.getRoot().getPath() + ";type="+sd.getStorageDirType());
- try {
- reportErrorsOnDirectory(sd);
- } catch (IOException ioe) {
- LOG.error("Failed to report and remove NN storage directory "
- + sd.getRoot().getPath(), ioe);
- }
+ LOG.warn("writeTransactionIdToStorage failed on " + sd,
+ e);
+ reportErrorsOnDirectory(sd);
}
}
}
@@ -525,11 +461,11 @@ public void setCheckpointTimeInStorage(long newCpT) {
*
* @return List of filenames to save checkpoints to.
*/
- public File[] getFsImageNameCheckpoint() {
+ public File[] getFsImageNameCheckpoint(long txid) {
    ArrayList<File> list = new ArrayList<File>();
    for (Iterator<StorageDirectory> it =
dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
- list.add(getStorageFile(it.next(), NameNodeFile.IMAGE_NEW));
+ list.add(getStorageFile(it.next(), NameNodeFile.IMAGE_NEW, txid));
}
return list.toArray(new File[list.size()]);
}
@@ -538,51 +474,24 @@ public File[] getFsImageNameCheckpoint() {
* Return the name of the image file.
* @return The name of the first image file.
*/
- public File getFsImageName() {
+ public File getFsImageName(long txid) {
StorageDirectory sd = null;
    for (Iterator<StorageDirectory> it =
dirIterator(NameNodeDirType.IMAGE); it.hasNext();) {
sd = it.next();
- File fsImage = getStorageFile(sd, NameNodeFile.IMAGE);
+ File fsImage = getStorageFile(sd, NameNodeFile.IMAGE, txid);
if(sd.getRoot().canRead() && fsImage.exists())
return fsImage;
}
return null;
}
- /**
- * @return The name of the first editlog file.
- */
- public File getFsEditName() throws IOException {
-    for (Iterator<StorageDirectory> it
- = dirIterator(NameNodeDirType.EDITS); it.hasNext();) {
- StorageDirectory sd = it.next();
- if(sd.getRoot().canRead())
- return getEditFile(sd);
- }
- return null;
- }
-
- /**
- * @return The name of the first time file.
- */
- public File getFsTimeName() {
- StorageDirectory sd = null;
- // NameNodeFile.TIME shoul be same on all directories
-    for (Iterator<StorageDirectory> it =
- dirIterator(); it.hasNext();)
- sd = it.next();
- return getStorageFile(sd, NameNodeFile.TIME);
- }
-
/** Create new dfs name directory. Caution: this destroys all files
* in this filesystem. */
private void format(StorageDirectory sd) throws IOException {
    sd.clearDirectory(); // create current dir
- for (NNStorageListener listener : listeners) {
- listener.formatOccurred(sd);
- }
writeProperties(sd);
+ writeTransactionIdFile(sd, 0);
LOG.info("Storage directory " + sd.getRoot()
+ " has been successfully formatted.");
@@ -597,7 +506,6 @@ public void format(String clusterId) throws IOException {
this.clusterID = clusterId;
this.blockpoolID = newBlockPoolID();
this.cTime = 0L;
- this.setCheckpointTime(now());
    for (Iterator<StorageDirectory> it =
dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
@@ -624,50 +532,6 @@ private int newNamespaceID() {
return newID;
}
-
- /**
- * Move {@code current} to {@code lastcheckpoint.tmp} and
- * recreate empty {@code current}.
- * {@code current} is moved only if it is well formatted,
- * that is contains VERSION file.
- *
- * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getLastCheckpointTmp()
- * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getPreviousCheckpoint()
- */
- protected void moveCurrent(StorageDirectory sd)
- throws IOException {
- File curDir = sd.getCurrentDir();
- File tmpCkptDir = sd.getLastCheckpointTmp();
- // mv current -> lastcheckpoint.tmp
- // only if current is formatted - has VERSION file
- if(sd.getVersionFile().exists()) {
- assert curDir.exists() : curDir + " directory must exist.";
- assert !tmpCkptDir.exists() : tmpCkptDir + " directory must not exist.";
- rename(curDir, tmpCkptDir);
- }
- // recreate current
- if(!curDir.exists() && !curDir.mkdir())
- throw new IOException("Cannot create directory " + curDir);
- }
-
- /**
- * Move {@code lastcheckpoint.tmp} to {@code previous.checkpoint}
- *
- * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getPreviousCheckpoint()
- * @see org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory#getLastCheckpointTmp()
- */
- protected void moveLastCheckpoint(StorageDirectory sd)
- throws IOException {
- File tmpCkptDir = sd.getLastCheckpointTmp();
- File prevCkptDir = sd.getPreviousCheckpoint();
- // remove previous.checkpoint
- if (prevCkptDir.exists())
- deleteDir(prevCkptDir);
- // mv lastcheckpoint.tmp -> previous.checkpoint
- if(tmpCkptDir.exists())
- rename(tmpCkptDir, prevCkptDir);
- }
-
@Override // Storage
protected void setFieldsFromProperties(
Properties props, StorageDirectory sd) throws IOException {
@@ -689,26 +553,35 @@ protected void setFieldsFromProperties(
setDistributedUpgradeState(
sDUS == null? false : Boolean.parseBoolean(sDUS),
sDUV == null? getLayoutVersion() : Integer.parseInt(sDUV));
-
- String sMd5 = props.getProperty(MESSAGE_DIGEST_PROPERTY);
- if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM, layoutVersion)) {
- if (sMd5 == null) {
- throw new InconsistentFSStateException(sd.getRoot(),
- "file " + STORAGE_FILE_VERSION
- + " does not have MD5 image digest.");
- }
- this.imageDigest = new MD5Hash(sMd5);
- } else if (sMd5 != null) {
- throw new InconsistentFSStateException(sd.getRoot(),
- "file " + STORAGE_FILE_VERSION +
- " has image MD5 digest when version is " + layoutVersion);
- }
-
- this.setCheckpointTime(readCheckpointTime(sd));
+ setDeprecatedPropertiesForUpgrade(props);
}
/**
- * Write last checkpoint time and version file into the storage directory.
+ * Pull any properties out of the VERSION file that are from older
+ * versions of HDFS and only necessary during upgrade.
+ */
+ private void setDeprecatedPropertiesForUpgrade(Properties props) {
+    deprecatedProperties = new HashMap<String, String>();
+ String md5 = props.getProperty(DEPRECATED_MESSAGE_DIGEST_PROPERTY);
+ if (md5 != null) {
+ deprecatedProperties.put(DEPRECATED_MESSAGE_DIGEST_PROPERTY, md5);
+ }
+ }
+
+ /**
+ * Return a property that was stored in an earlier version of HDFS.
+ *
+ * This should only be used during upgrades.
+ */
+ String getDeprecatedProperty(String prop) {
+ assert getLayoutVersion() > FSConstants.LAYOUT_VERSION :
+ "getDeprecatedProperty should only be done when loading " +
+ "storage from past versions during upgrade.";
+ return deprecatedProperties.get(prop);
+ }
+
+ /**
+ * Write version file into the storage directory.
*
* The version file should always be written last.
* Missing or corrupted version file indicates that
@@ -733,50 +606,109 @@ protected void setPropertiesFromFields(Properties props,
props.setProperty("distributedUpgradeVersion",
Integer.toString(uVersion));
}
- if (LayoutVersion.supports(Feature.FSIMAGE_CHECKSUM, layoutVersion)) {
- // Though the current NN supports this feature, this function
- // is called with old layoutVersions from the upgrade tests.
- if (imageDigest == null) {
- // May be null on the first save after an upgrade.
- imageDigest = MD5Hash.digest(
- new FileInputStream(getStorageFile(sd, NameNodeFile.IMAGE)));
- }
- props.setProperty(MESSAGE_DIGEST_PROPERTY, imageDigest.toString());
- }
-
- writeCheckpointTime(sd);
}
-
+
+ static File getStorageFile(StorageDirectory sd, NameNodeFile type, long imageTxId) {
+ return new File(sd.getCurrentDir(),
+ String.format("%s_%019d", type.getName(), imageTxId));
+ }
+
/**
- * @return A File of 'type' in storage directory 'sd'.
+ * Get a storage file for one of the files that doesn't need a txid associated
+   * (e.g. VERSION, seen_txid).
*/
static File getStorageFile(StorageDirectory sd, NameNodeFile type) {
return new File(sd.getCurrentDir(), type.getName());
}
+ @VisibleForTesting
+ public static String getCheckpointImageFileName(long txid) {
+ return String.format("%s_%019d",
+ NameNodeFile.IMAGE_NEW.getName(), txid);
+ }
+
+ @VisibleForTesting
+ public static String getImageFileName(long txid) {
+ return String.format("%s_%019d",
+ NameNodeFile.IMAGE.getName(), txid);
+ }
+
+ @VisibleForTesting
+ public static String getInProgressEditsFileName(long startTxId) {
+ return String.format("%s_%019d", NameNodeFile.EDITS_INPROGRESS.getName(),
+ startTxId);
+ }
+
+ static File getInProgressEditsFile(StorageDirectory sd, long startTxId) {
+ return new File(sd.getCurrentDir(), getInProgressEditsFileName(startTxId));
+ }
+
+ static File getFinalizedEditsFile(StorageDirectory sd,
+ long startTxId, long endTxId) {
+ return new File(sd.getCurrentDir(),
+ getFinalizedEditsFileName(startTxId, endTxId));
+ }
+
+ static File getImageFile(StorageDirectory sd, long txid) {
+ return new File(sd.getCurrentDir(),
+ getImageFileName(txid));
+ }
+
+ @VisibleForTesting
+ public static String getFinalizedEditsFileName(long startTxId, long endTxId) {
+ return String.format("%s_%019d-%019d", NameNodeFile.EDITS.getName(),
+ startTxId, endTxId);
+ }
+
/**
- * @return A editlog File in storage directory 'sd'.
+   * Return the first readable finalized edits file for the given transaction range.
*/
- File getEditFile(StorageDirectory sd) {
- return getStorageFile(sd, NameNodeFile.EDITS);
+ File findFinalizedEditsFile(long startTxId, long endTxId)
+ throws IOException {
+ File ret = findFile(NameNodeDirType.EDITS,
+ getFinalizedEditsFileName(startTxId, endTxId));
+ if (ret == null) {
+ throw new IOException(
+ "No edits file for txid " + startTxId + "-" + endTxId + " exists!");
+ }
+ return ret;
+ }
+
+ /**
+ * Return the first readable image file for the given txid, or null
+ * if no such image can be found
+ */
+ File findImageFile(long txid) throws IOException {
+ return findFile(NameNodeDirType.IMAGE,
+ getImageFileName(txid));
}
/**
- * @return A temporary editlog File in storage directory 'sd'.
+ * Return the first readable storage file of the given name
+ * across any of the 'current' directories in SDs of the
+ * given type, or null if no such file exists.
*/
- File getEditNewFile(StorageDirectory sd) {
- return getStorageFile(sd, NameNodeFile.EDITS_NEW);
+ private File findFile(NameNodeDirType dirType, String name) {
+ for (StorageDirectory sd : dirIterable(dirType)) {
+ File candidate = new File(sd.getCurrentDir(), name);
+ if (sd.getCurrentDir().canRead() &&
+ candidate.exists()) {
+ return candidate;
+ }
+ }
+ return null;
}
/**
- * @return A list of all Files of 'type' in available storage directories.
+ * @return A list of the given File in every available storage directory,
+ * regardless of whether it might exist.
*/
-  Collection<File> getFiles(NameNodeFile type, NameNodeDirType dirType) {
+  List<File> getFiles(NameNodeDirType dirType, String fileName) {
    ArrayList<File> list = new ArrayList<File>();
    Iterator<StorageDirectory> it =
(dirType == null) ? dirIterator() : dirIterator(dirType);
for ( ;it.hasNext(); ) {
- list.add(getStorageFile(it.next(), type));
+ list.add(new File(it.next().getCurrentDir(), fileName));
}
return list;
}
@@ -809,7 +741,9 @@ int getDistributedUpgradeVersion() {
* @param uVersion the new version.
*/
private void setDistributedUpgradeState(boolean uState, int uVersion) {
- upgradeManager.setUpgradeState(uState, uVersion);
+ if (upgradeManager != null) {
+ upgradeManager.setUpgradeState(uState, uVersion);
+ }
}
/**
@@ -849,33 +783,6 @@ void initializeDistributedUpgrade() throws IOException {
+ FSConstants.LAYOUT_VERSION + " is initialized.");
}
- /**
- * Set the digest for the latest image stored by NNStorage.
- * @param digest The digest for the image.
- */
- void setImageDigest(MD5Hash digest) {
- this.imageDigest = digest;
- }
-
- /**
- * Get the digest for the latest image storage by NNStorage.
- * @return The digest for the latest image.
- */
- MD5Hash getImageDigest() {
- return imageDigest;
- }
-
- /**
- * Register a listener. The listener will be notified of changes to the list
- * of available storage directories.
- *
- * @see NNStorageListener
- * @param sel A storage listener.
- */
- void registerListener(NNStorageListener sel) {
- listeners.add(sel);
- }
-
/**
* Disable the check for pre-upgradable layouts. Needed for BackupImage.
* @param val Whether to disable the preupgradeable layout check.
@@ -890,7 +797,7 @@ void setDisablePreUpgradableLayoutCheck(boolean val) {
* @param sds A list of storage directories to mark as errored.
*/
-  void reportErrorsOnDirectories(List<StorageDirectory> sds) throws IOException {
+  void reportErrorsOnDirectories(List<StorageDirectory> sds) {
for (StorageDirectory sd : sds) {
reportErrorsOnDirectory(sd);
}
@@ -904,17 +811,12 @@ void reportErrorsOnDirectories(List sds) throws IOException {
* @param sd A storage directory to mark as errored.
*/
- void reportErrorsOnDirectory(StorageDirectory sd)
- throws IOException {
+ void reportErrorsOnDirectory(StorageDirectory sd) {
LOG.error("Error reported on storage directory " + sd);
String lsd = listStorageDirectories();
LOG.debug("current list of storage dirs:" + lsd);
- for (NNStorageListener listener : listeners) {
- listener.errorOccurred(sd);
- }
-
LOG.warn("About to remove corresponding storage: "
+ sd.getRoot().getAbsolutePath());
try {
@@ -927,8 +829,7 @@ void reportErrorsOnDirectory(StorageDirectory sd)
if (this.storageDirs.remove(sd)) {
this.removedStorageDirs.add(sd);
}
- incrementCheckpointTime();
-
+
lsd = listStorageDirectories();
LOG.debug("at the end current list of storage dirs:" + lsd);
}
@@ -967,6 +868,29 @@ void processStartupOptionsForUpgrade(StartupOption startOpt, int layoutVersion)
}
}
+ /**
+ * Report that an IOE has occurred on some file which may
+ * or may not be within one of the NN image storage directories.
+ */
+ void reportErrorOnFile(File f) {
+ // We use getAbsolutePath here instead of getCanonicalPath since we know
+ // that there is some IO problem on that drive.
+ // getCanonicalPath may need to call stat() or readlink() and it's likely
+ // those calls would fail due to the same underlying IO problem.
+ String absPath = f.getAbsolutePath();
+ for (StorageDirectory sd : storageDirs) {
+ String dirPath = sd.getRoot().getAbsolutePath();
+ if (!dirPath.endsWith("/")) {
+ dirPath += "/";
+ }
+ if (absPath.startsWith(dirPath)) {
+ reportErrorsOnDirectory(sd);
+ return;
+ }
+ }
+
+ }
+
/**
* Generate new clusterID.
*
@@ -1065,4 +989,67 @@ private void setBlockPoolID(File storage, String bpid)
public String getBlockPoolID() {
return blockpoolID;
}
+
+ /**
+ * Iterate over all current storage directories, inspecting them
+ * with the given inspector.
+ */
+ void inspectStorageDirs(FSImageStorageInspector inspector)
+ throws IOException {
+
+ // Process each of the storage directories to find the pair of
+ // newest image file and edit file
+    for (Iterator<StorageDirectory> it = dirIterator(); it.hasNext();) {
+ StorageDirectory sd = it.next();
+ inspector.inspectDirectory(sd);
+ }
+ }
+
+ /**
+ * Iterate over all of the storage dirs, reading their contents to determine
+ * their layout versions. Returns an FSImageStorageInspector which has
+ * inspected each directory.
+ *
+ * Note: this can mutate the storage info fields (ctime, version, etc).
+ * @throws IOException if no valid storage dirs are found
+ */
+ FSImageStorageInspector readAndInspectDirs()
+ throws IOException {
+ int minLayoutVersion = Integer.MAX_VALUE; // the newest
+ int maxLayoutVersion = Integer.MIN_VALUE; // the oldest
+
+ // First determine what range of layout versions we're going to inspect
+    for (Iterator<StorageDirectory> it = dirIterator();
+ it.hasNext();) {
+ StorageDirectory sd = it.next();
+ if (!sd.getVersionFile().exists()) {
+ FSImage.LOG.warn("Storage directory " + sd + " contains no VERSION file. Skipping...");
+ continue;
+ }
+ readProperties(sd); // sets layoutVersion
+ minLayoutVersion = Math.min(minLayoutVersion, getLayoutVersion());
+ maxLayoutVersion = Math.max(maxLayoutVersion, getLayoutVersion());
+ }
+
+ if (minLayoutVersion > maxLayoutVersion) {
+ throw new IOException("No storage directories contained VERSION information");
+ }
+ assert minLayoutVersion <= maxLayoutVersion;
+
+    // If we have any storage directories with the new layout version
+    // (i.e. txid-based edits_ file names) then use the new inspector, which
+    // will ignore the old format dirs.
+ FSImageStorageInspector inspector;
+ if (LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, minLayoutVersion)) {
+ inspector = new FSImageTransactionalStorageInspector();
+ if (!LayoutVersion.supports(Feature.TXID_BASED_LAYOUT, maxLayoutVersion)) {
+ FSImage.LOG.warn("Ignoring one or more storage directories with old layouts");
+ }
+ } else {
+ inspector = new FSImagePreTransactionalStorageInspector();
+ }
+
+ inspectStorageDirs(inspector);
+ return inspector;
+ }
}
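
The seen_txid mechanism is small enough to model standalone: the last-seen transaction ID is stored as one decimal line, written via an atomic replace and read back with a default of 0L when absent. A sketch that approximates AtomicFileOutputStream with a temp-file-plus-rename (atomic on POSIX filesystems); SeenTxId is illustrative:

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Writer;

// Illustrative model of readTransactionIdFile/writeTransactionIdFile.
class SeenTxId {
  static long read(File txidFile) throws IOException {
    if (!txidFile.exists() || !txidFile.canRead()) {
      return 0L; // same default as the patch
    }
    try (BufferedReader br = new BufferedReader(new FileReader(txidFile))) {
      // Assumes a well-formed single-line file, as the patch does.
      return Long.parseLong(br.readLine().trim());
    }
  }

  static void write(File txidFile, long txid) throws IOException {
    if (txid < 0) {
      throw new IllegalArgumentException("bad txid: " + txid);
    }
    File tmp = new File(txidFile.getParent(), txidFile.getName() + ".tmp");
    try (Writer w = new FileWriter(tmp)) {
      w.write(Long.toString(txid));
      w.write('\n');
    }
    // rename-over is atomic on POSIX; Windows would need an explicit delete.
    if (!tmp.renameTo(txidFile)) {
      throw new IOException("Could not replace " + txidFile);
    }
  }
}
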
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java
new file mode 100644
index 00000000000..4b5f9a90902
--- /dev/null
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NNStorageRetentionManager.java
@@ -0,0 +1,151 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import java.util.TreeSet;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.FoundEditLog;
+import org.apache.hadoop.hdfs.server.namenode.FSImageTransactionalStorageInspector.FoundFSImage;
+import org.apache.hadoop.hdfs.util.MD5FileUtils;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+
+/**
+ * The NNStorageRetentionManager is responsible for inspecting the storage
+ * directories of the NN and enforcing a retention policy on checkpoints
+ * and edit logs.
+ *
+ * It delegates the actual removal of files to a StoragePurger
+ * implementation, which might delete the files or instead copy them to
+ * a filer or HDFS for later analysis.
+ */
+public class NNStorageRetentionManager {
+
+ private final int numCheckpointsToRetain;
+ private static final Log LOG = LogFactory.getLog(
+ NNStorageRetentionManager.class);
+ private final NNStorage storage;
+ private final StoragePurger purger;
+ private final FSEditLog editLog;
+
+ public NNStorageRetentionManager(
+ Configuration conf,
+ NNStorage storage,
+ FSEditLog editLog,
+ StoragePurger purger) {
+ this.numCheckpointsToRetain = conf.getInt(
+ DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_KEY,
+ DFSConfigKeys.DFS_NAMENODE_NUM_CHECKPOINTS_RETAINED_DEFAULT);
+ this.storage = storage;
+ this.editLog = editLog;
+ this.purger = purger;
+ }
+
+ public NNStorageRetentionManager(Configuration conf, NNStorage storage,
+ FSEditLog editLog) {
+ this(conf, storage, editLog, new DeletionStoragePurger());
+ }
+
+ public void purgeOldStorage() throws IOException {
+ FSImageTransactionalStorageInspector inspector =
+ new FSImageTransactionalStorageInspector();
+ storage.inspectStorageDirs(inspector);
+
+ long minImageTxId = getImageTxIdToRetain(inspector);
+ purgeCheckpointsOlderThan(inspector, minImageTxId);
+ // If fsimage_N is the image we want to keep, then we need to keep
+ // all txns > N. We can remove anything < N+1, since fsimage_N
+ // reflects the state up to and including N.
+ editLog.purgeLogsOlderThan(minImageTxId + 1, purger);
+ }
+
+ private void purgeCheckpointsOlderThan(
+ FSImageTransactionalStorageInspector inspector,
+ long minTxId) {
+ for (FoundFSImage image : inspector.getFoundImages()) {
+ if (image.getTxId() < minTxId) {
+ LOG.info("Purging old image " + image);
+ purger.purgeImage(image);
+ }
+ }
+ }
+
+ /**
+ * @param inspector inspector that has already inspected all storage dirs
+ * @return the transaction ID corresponding to the oldest checkpoint
+ * that should be retained.
+ */
+ private long getImageTxIdToRetain(FSImageTransactionalStorageInspector inspector) {
+
+    List<FoundFSImage> images = inspector.getFoundImages();
+    TreeSet<Long> imageTxIds = Sets.newTreeSet();
+ for (FoundFSImage image : images) {
+ imageTxIds.add(image.getTxId());
+ }
+
+    List<Long> imageTxIdsList = Lists.newArrayList(imageTxIds);
+ if (imageTxIdsList.isEmpty()) {
+ return 0;
+ }
+
+ Collections.reverse(imageTxIdsList);
+ int toRetain = Math.min(numCheckpointsToRetain, imageTxIdsList.size());
+ long minTxId = imageTxIdsList.get(toRetain - 1);
+ LOG.info("Going to retain " + toRetain + " images with txid >= " +
+ minTxId);
+ return minTxId;
+ }
+
+ /**
+ * Interface responsible for disposing of old checkpoints and edit logs.
+ */
+ static interface StoragePurger {
+ void purgeLog(FoundEditLog log);
+ void purgeImage(FoundFSImage image);
+ }
+
+ static class DeletionStoragePurger implements StoragePurger {
+ @Override
+ public void purgeLog(FoundEditLog log) {
+ deleteOrWarn(log.getFile());
+ }
+
+ @Override
+ public void purgeImage(FoundFSImage image) {
+ deleteOrWarn(image.getFile());
+ deleteOrWarn(MD5FileUtils.getDigestFileForFile(image.getFile()));
+ }
+
+ private static void deleteOrWarn(File file) {
+ if (!file.delete()) {
+ // It's OK if we fail to delete something -- we'll catch it
+ // next time we swing through this directory.
+ LOG.warn("Could not delete " + file);
+ }
+ }
+ }
+}
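
getImageTxIdToRetain reduces to: deduplicate and sort the image txids, keep the newest N, and return the oldest kept txid as the retention floor; edit logs can then be purged below floor + 1 because fsimage_N already reflects every transaction up to and including N. A standalone sketch of just that computation; RetentionFloor is illustrative:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.TreeSet;

// Illustrative stand-in for the retention-floor computation.
class RetentionFloor {
  static long compute(List<Long> imageTxIds, int numCheckpointsToRetain) {
    if (imageTxIds.isEmpty()) {
      return 0L;
    }
    // Deduplicate, sort ascending, then reverse so index 0 is the newest.
    List<Long> sorted = new ArrayList<Long>(new TreeSet<Long>(imageTxIds));
    Collections.reverse(sorted);
    int toRetain = Math.min(numCheckpointsToRetain, sorted.size());
    return sorted.get(toRetain - 1); // oldest txid still retained
  }
}
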
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
index 4192c42af25..7450afc03aa 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java
@@ -78,6 +78,7 @@
import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.hdfs.server.protocol.NodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.hdfs.server.protocol.UpgradeCommand;
import org.apache.hadoop.io.EnumSetWritable;
import org.apache.hadoop.io.Text;
@@ -382,7 +383,7 @@ NamenodeRegistration setRegistration() {
nodeRegistration = new NamenodeRegistration(
getHostPortString(rpcAddress),
getHostPortString(getHttpAddress()),
- getFSImage().getStorage(), getRole(), getFSImage().getStorage().getCheckpointTime());
+ getFSImage().getStorage(), getRole());
return nodeRegistration;
}
@@ -647,8 +648,9 @@ public void errorReport(NamenodeRegistration registration,
public NamenodeRegistration register(NamenodeRegistration registration)
throws IOException {
verifyVersion(registration.getVersion());
- namesystem.registerBackupNode(registration);
- return setRegistration();
+ NamenodeRegistration myRegistration = setRegistration();
+ namesystem.registerBackupNode(registration, myRegistration);
+ return myRegistration;
}
@Override // NamenodeProtocol
@@ -669,22 +671,6 @@ public void endCheckpoint(NamenodeRegistration registration,
namesystem.endCheckpoint(registration, sig);
}
- @Override // NamenodeProtocol
- public long journalSize(NamenodeRegistration registration)
- throws IOException {
- verifyRequest(registration);
- return namesystem.getEditLogSize();
- }
-
- @Override // NamenodeProtocol
- public void journal(NamenodeRegistration registration,
- int jAction,
- int length,
- byte[] args) throws IOException {
- // Active name-node cannot journal.
- throw new UnsupportedActionException("journal");
- }
-
@Override // ClientProtocol
  public Token<DelegationTokenIdentifier> getDelegationToken(Text renewer)
throws IOException {
@@ -1056,21 +1042,20 @@ public void refreshNodes() throws IOException {
namesystem.refreshNodes(new HdfsConfiguration());
}
- @Deprecated // NamenodeProtocol
- public long getEditLogSize() throws IOException {
- return namesystem.getEditLogSize();
+ @Override // NamenodeProtocol
+ public long getTransactionID() {
+ return namesystem.getTransactionID();
}
- @Deprecated
@Override // NamenodeProtocol
public CheckpointSignature rollEditLog() throws IOException {
return namesystem.rollEditLog();
}
-
- @Deprecated
- @Override // NamenodeProtocol
- public void rollFsImage(CheckpointSignature sig) throws IOException {
- namesystem.rollFSImage(sig);
+
+ @Override
+ public RemoteEditLogManifest getEditLogManifest(long sinceTxId)
+ throws IOException {
+ return namesystem.getEditLogManifest(sinceTxId);
}
@Override // ClientProtocol
@@ -1279,26 +1264,11 @@ public void verifyVersion(int version) throws IOException {
if (version != LAYOUT_VERSION)
throw new IncorrectVersionException(version, "data node");
}
-
- /**
- * Returns the name of the fsImage file
- */
- public File getFsImageName() throws IOException {
- return getFSImage().getStorage().getFsImageName();
- }
public FSImage getFSImage() {
return namesystem.dir.fsImage;
}
- /**
- * Returns the name of the fsImage file uploaded by periodic
- * checkpointing
- */
- public File[] getFsImageNameCheckpoint() throws IOException {
- return getFSImage().getStorage().getFsImageNameCheckpoint();
- }
-
/**
* Returns the address on which the NameNodes is listening to.
* @return namenode rpc address
@@ -1374,20 +1344,16 @@ private static boolean format(Configuration conf,
}
System.out.println("Formatting using clusterid: " + clusterId);
- FSImage fsImage = new FSImage(dirsToFormat, editDirsToFormat);
+ FSImage fsImage = new FSImage(conf, null, dirsToFormat, editDirsToFormat);
FSNamesystem nsys = new FSNamesystem(fsImage, conf);
- nsys.dir.fsImage.getStorage().format(clusterId);
+ nsys.dir.fsImage.format(clusterId);
return false;
}
private static boolean finalize(Configuration conf,
boolean isConfirmationNeeded
) throws IOException {
-    Collection<URI> dirsToFormat = FSNamesystem.getNamespaceDirs(conf);
-    Collection<URI> editDirsToFormat =
-      FSNamesystem.getNamespaceEditsDirs(conf);
- FSNamesystem nsys = new FSNamesystem(new FSImage(dirsToFormat,
- editDirsToFormat), conf);
+ FSNamesystem nsys = new FSNamesystem(new FSImage(conf), conf);
System.err.print(
"\"finalize\" will remove the previous state of the files system.\n"
+ "Recent upgrade will become permanent.\n"
diff --git a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
index ef3fd456267..f126f17eeb8 100644
--- a/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
+++ b/hdfs/src/java/org/apache/hadoop/hdfs/server/namenode/SecondaryNameNode.java
@@ -23,11 +23,19 @@
import java.net.URI;
import java.security.PrivilegedAction;
import java.security.PrivilegedExceptionAction;
-import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
+import java.util.List;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.PosixParser;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -43,10 +51,11 @@
import org.apache.hadoop.hdfs.server.common.JspHelper;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.common.Storage.StorageState;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
-import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
+import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
import org.apache.hadoop.http.HttpServer;
+import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -60,6 +69,9 @@
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.StringUtils;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.ImmutableList;
+
/**********************************************************
* The Secondary NameNode is a helper to the primary NameNode.
* The Secondary is responsible for supporting periodic checkpoints
@@ -98,11 +110,18 @@ public class SecondaryNameNode implements Runnable {
private int imagePort;
private String infoBindAddress;
- private FSNamesystem namesystem;
  private Collection<URI> checkpointDirs;
  private Collection<URI> checkpointEditsDirs;
+
+ /** How often to checkpoint regardless of number of txns */
private long checkpointPeriod; // in seconds
- private long checkpointSize; // size (in bytes) of current Edit Log
+
+ /** How often to poll the NN to check checkpointTxnCount */
+ private long checkpointCheckPeriod; // in seconds
+
+ /** checkpoint once every this many transactions, regardless of time */
+ private long checkpointTxnCount;
+
/** {@inheritDoc} */
public String toString() {
@@ -111,23 +130,49 @@ public String toString() {
+ "\nStart Time : " + new Date(starttime)
+ "\nLast Checkpoint Time : " + (lastCheckpointTime == 0? "--": new Date(lastCheckpointTime))
+ "\nCheckpoint Period : " + checkpointPeriod + " seconds"
- + "\nCheckpoint Size : " + StringUtils.byteDesc(checkpointSize)
- + " (= " + checkpointSize + " bytes)"
+      + "\nCheckpoint Transactions: " + checkpointTxnCount + " txns"
+ "\nCheckpoint Dirs : " + checkpointDirs
+ "\nCheckpoint Edits Dirs: " + checkpointEditsDirs;
}
+ @VisibleForTesting
FSImage getFSImage() {
return checkpointImage;
}
+
+ @VisibleForTesting
+ void setFSImage(CheckpointStorage image) {
+ this.checkpointImage = image;
+ }
+
+ @VisibleForTesting
+ NamenodeProtocol getNameNode() {
+ return namenode;
+ }
+
+ @VisibleForTesting
+ void setNameNode(NamenodeProtocol namenode) {
+ this.namenode = namenode;
+ }
+
+  @VisibleForTesting
+  List<URI> getCheckpointDirs() {
+ return ImmutableList.copyOf(checkpointDirs);
+ }
+
/**
* Create a connection to the primary namenode.
*/
public SecondaryNameNode(Configuration conf) throws IOException {
+ this(conf, new CommandLineOpts());
+ }
+
+ public SecondaryNameNode(Configuration conf,
+ CommandLineOpts commandLineOpts) throws IOException {
try {
NameNode.initializeGenericKeys(conf);
- initialize(conf);
+ initialize(conf, commandLineOpts);
} catch(IOException e) {
shutdown();
LOG.fatal("Failed to start secondary namenode. ", e);
@@ -143,8 +188,10 @@ public static InetSocketAddress getHttpAddress(Configuration conf) {
/**
* Initialize SecondaryNameNode.
+   * @param commandLineOpts command-line options parsed at startup
*/
- private void initialize(final Configuration conf) throws IOException {
+ private void initialize(final Configuration conf,
+ CommandLineOpts commandLineOpts) throws IOException {
final InetSocketAddress infoSocAddr = getHttpAddress(conf);
infoBindAddress = infoSocAddr.getHostName();
UserGroupInformation.setConfiguration(conf);
@@ -171,14 +218,19 @@ private void initialize(final Configuration conf) throws IOException {
"/tmp/hadoop/dfs/namesecondary");
checkpointEditsDirs = FSImage.getCheckpointEditsDirs(conf,
"/tmp/hadoop/dfs/namesecondary");
- checkpointImage = new CheckpointStorage(conf);
- checkpointImage.recoverCreate(checkpointDirs, checkpointEditsDirs);
+ checkpointImage = new CheckpointStorage(conf, checkpointDirs, checkpointEditsDirs);
+ checkpointImage.recoverCreate(commandLineOpts.shouldFormat());
// Initialize other scheduling parameters from the configuration
+ checkpointCheckPeriod = conf.getLong(
+ DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY,
+ DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_DEFAULT);
+
checkpointPeriod = conf.getLong(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT);
- checkpointSize = conf.getLong(DFS_NAMENODE_CHECKPOINT_SIZE_KEY,
- DFS_NAMENODE_CHECKPOINT_SIZE_DEFAULT);
+ checkpointTxnCount = conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
+ DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
+ warnForDeprecatedConfigs(conf);
// initialize the webserver for uploading files.
// Kerberized SSL servers must be run from the host principal...
@@ -204,8 +256,8 @@ public HttpServer run() throws IOException, InterruptedException {
System.setProperty("https.cipherSuites",
Krb5AndCertsSslSocketConnector.KRB5_CIPHER_SUITES.get(0));
InetSocketAddress secInfoSocAddr =
- NetUtils.createSocketAddr(infoBindAddress + ":"+ conf.get(
- "dfs.secondary.https.port", infoBindAddress + ":" + 0));
+ NetUtils.createSocketAddr(infoBindAddress + ":"+ conf.getInt(
+ "dfs.secondary.https.port", 443));
imagePort = secInfoSocAddr.getPort();
infoServer.addSslListener(secInfoSocAddr, conf, false, true);
}
@@ -227,15 +279,28 @@ public HttpServer run() throws IOException, InterruptedException {
// The web-server port can be ephemeral... ensure we have the correct info
infoPort = infoServer.getPort();
- if(!UserGroupInformation.isSecurityEnabled())
+ if (!UserGroupInformation.isSecurityEnabled()) {
imagePort = infoPort;
+ }
+
conf.set(DFS_NAMENODE_SECONDARY_HTTP_ADDRESS_KEY, infoBindAddress + ":" +infoPort);
LOG.info("Secondary Web-server up at: " + infoBindAddress + ":" +infoPort);
LOG.info("Secondary image servlet up at: " + infoBindAddress + ":" + imagePort);
- LOG.warn("Checkpoint Period :" + checkpointPeriod + " secs " +
+ LOG.info("Checkpoint Period :" + checkpointPeriod + " secs " +
"(" + checkpointPeriod/60 + " min)");
- LOG.warn("Log Size Trigger :" + checkpointSize + " bytes " +
- "(" + checkpointSize/1024 + " KB)");
+ LOG.info("Log Size Trigger :" + checkpointTxnCount + " txns");
+ }
+
+ static void warnForDeprecatedConfigs(Configuration conf) {
+ for (String key : ImmutableList.of(
+ "fs.checkpoint.size",
+ "dfs.namenode.checkpoint.size")) {
+ if (conf.get(key) != null) {
+ LOG.warn("Configuration key " + key + " is deprecated! Ignoring..." +
+ " Instead please specify a value for " +
+ DFS_NAMENODE_CHECKPOINT_TXNS_KEY);
+ }
+ }
}
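
(A hedged aside: the deprecated size keys are now ignored in favor of the
transaction trigger. A minimal sketch of setting the new keys programmatically;
the constants are the ones added to DFSConfigKeys, the values are just the
shipped defaults, and the usual org.apache.hadoop.hdfs imports are elided.)

    Configuration conf = new HdfsConfiguration();
    conf.setLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 40000L);
    conf.setLong(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, 60L);
    // Setting the old key only produces the warning logged above:
    // conf.setLong("dfs.namenode.checkpoint.size", 67108864L);  // ignored
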
/**
@@ -283,13 +348,10 @@ public Object run() {
public void doWork() {
//
- // Poll the Namenode (once every 5 minutes) to find the size of the
- // pending edit log.
+ // Poll the Namenode (once every checkpointCheckPeriod seconds) to find the
+ // number of transactions in the edit log that haven't yet been checkpointed.
//
- long period = 5 * 60; // 5 minutes
- if (checkpointPeriod < period) {
- period = checkpointPeriod;
- }
+ long period = Math.min(checkpointCheckPeriod, checkpointPeriod);
while (shouldRun) {
try {
@@ -307,8 +369,7 @@ public void doWork() {
long now = System.currentTimeMillis();
- long size = namenode.getEditLogSize();
- if (size >= checkpointSize ||
+ if (shouldCheckpointBasedOnCount() ||
now >= lastCheckpointTime + 1000 * checkpointPeriod) {
doCheckpoint();
lastCheckpointTime = now;
@@ -316,7 +377,6 @@ public void doWork() {
} catch (IOException e) {
LOG.error("Exception in doCheckpoint", e);
e.printStackTrace();
- checkpointImage.getStorage().imageDigest = null;
} catch (Throwable e) {
LOG.error("Throwable Exception in doCheckpoint", e);
e.printStackTrace();
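
With the defaults (check period 60 s, checkpoint period 3600 s, 40000 txns),
the daemon wakes once a minute and checkpoints as soon as either trigger
fires. A condensed, hypothetical restatement of the loop, exception handling
elided:

    long period = Math.min(checkpointCheckPeriod, checkpointPeriod); // seconds
    while (shouldRun) {
      Thread.sleep(1000 * period);                        // poll interval
      long now = System.currentTimeMillis();
      if (shouldCheckpointBasedOnCount()                  // txn-count trigger
          || now >= lastCheckpointTime + 1000 * checkpointPeriod) { // time trigger
        doCheckpoint();
        lastCheckpointTime = now;
      }
    }
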
@@ -331,49 +391,53 @@ public void doWork() {
* @return true if a new image has been downloaded and needs to be loaded
* @throws IOException
*/
- private boolean downloadCheckpointFiles(final CheckpointSignature sig
- ) throws IOException {
+ static boolean downloadCheckpointFiles(
+ final String nnHostPort,
+ final FSImage dstImage,
+ final CheckpointSignature sig,
+ final RemoteEditLogManifest manifest
+ ) throws IOException {
+
+    // Sanity-check the manifest. A bad manifest can happen if, e.g., someone
+    // on the NN side accidentally removed the storage directories.
+ if (manifest.getLogs().isEmpty()) {
+ throw new IOException("Found no edit logs to download on NN since txid "
+ + sig.mostRecentCheckpointTxId);
+ }
+
+ long expectedTxId = sig.mostRecentCheckpointTxId + 1;
+ if (manifest.getLogs().get(0).getStartTxId() != expectedTxId) {
+ throw new IOException("Bad edit log manifest (expected txid = " +
+          expectedTxId + "): " + manifest);
+ }
+
try {
Boolean b = UserGroupInformation.getCurrentUser().doAs(
          new PrivilegedExceptionAction<Boolean>() {
@Override
public Boolean run() throws Exception {
- checkpointImage.getStorage().cTime = sig.cTime;
- checkpointImage.getStorage().setCheckpointTime(sig.checkpointTime);
+ dstImage.getStorage().cTime = sig.cTime;
// get fsimage
- String fileid;
-            Collection<File> list;
- File[] srcNames;
boolean downloadImage = true;
- if (sig.imageDigest.equals(
- checkpointImage.getStorage().imageDigest)) {
+ if (sig.mostRecentCheckpointTxId ==
+ dstImage.getStorage().getMostRecentCheckpointTxId()) {
downloadImage = false;
LOG.info("Image has not changed. Will not download image.");
} else {
- fileid = "getimage=1";
- list = checkpointImage.getStorage().getFiles(
- NameNodeFile.IMAGE, NameNodeDirType.IMAGE);
- srcNames = list.toArray(new File[list.size()]);
- assert srcNames.length > 0 : "No checkpoint targets.";
- TransferFsImage.getFileClient(fsName, fileid, srcNames, false);
- checkpointImage.getStorage().imageDigest = sig.imageDigest;
- LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
- srcNames[0].length() + " bytes.");
+ MD5Hash downloadedHash = TransferFsImage.downloadImageToStorage(
+ nnHostPort, sig.mostRecentCheckpointTxId, dstImage.getStorage(), true);
+ dstImage.saveDigestAndRenameCheckpointImage(
+ sig.mostRecentCheckpointTxId, downloadedHash);
}
// get edits file
- fileid = "getedit=1";
- list = getFSImage().getStorage().getFiles(
- NameNodeFile.EDITS, NameNodeDirType.EDITS);
- srcNames = list.toArray(new File[list.size()]);;
- assert srcNames.length > 0 : "No checkpoint targets.";
- TransferFsImage.getFileClient(fsName, fileid, srcNames, false);
- LOG.info("Downloaded file " + srcNames[0].getName() + " size " +
- srcNames[0].length() + " bytes.");
+ for (RemoteEditLog log : manifest.getLogs()) {
+ TransferFsImage.downloadEditsToStorage(
+ nnHostPort, log, dstImage.getStorage());
+ }
- checkpointImage.checkpointUploadDone();
return Boolean.valueOf(downloadImage);
}
});
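
The manifest sanity check is pure txid arithmetic: a signature whose
mostRecentCheckpointTxId is 100 requires the first remote segment to start at
101. A small sketch with hypothetical values:

    long expectedTxId = sig.mostRecentCheckpointTxId + 1;      // 100 + 1 = 101
    long firstTxId = manifest.getLogs().get(0).getStartTxId();
    if (firstTxId != expectedTxId) {
      // e.g. firstTxId == 150 would mean txids 101-149 were lost on the NN
      throw new IOException("Bad edit log manifest (expected txid = "
          + expectedTxId + "): " + manifest);
    }
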
@@ -387,18 +451,6 @@ InetSocketAddress getNameNodeAddress() {
return nameNodeAddr;
}
- /**
- * Copy the new fsimage into the NameNode
- */
- private void putFSImage(CheckpointSignature sig) throws IOException {
- String fileid = "putimage=1&port=" + imagePort +
- "&machine=" + infoBindAddress +
- "&token=" + sig.toString() +
- "&newChecksum=" + checkpointImage.getStorage().getImageDigest();
- LOG.info("Posted URL " + fsName + fileid);
- TransferFsImage.getFileClient(fsName, fileid, (File[])null, false);
- }
-
/**
* Returns the Jetty server that the Namenode is listening on.
*/
@@ -423,19 +475,39 @@ private String getInfoServer() throws IOException {
return configuredAddress;
}
}
+
+ /**
+   * Return the host:port on which this SecondaryNameNode listens
+   * for image transfers.
+ */
+ private InetSocketAddress getImageListenAddress() {
+ return new InetSocketAddress(infoBindAddress, imagePort);
+ }
/**
* Create a new checkpoint
   * @return true if a new image was fetched from the primary, false otherwise
*/
boolean doCheckpoint() throws IOException {
-
- // Do the required initialization of the merge work area.
- startCheckpoint();
-
+ checkpointImage.ensureCurrentDirExists();
+ NNStorage dstStorage = checkpointImage.getStorage();
+
// Tell the namenode to start logging transactions in a new edit file
// Returns a token that would be used to upload the merged image.
CheckpointSignature sig = namenode.rollEditLog();
+
+ // Make sure we're talking to the same NN!
+ if (checkpointImage.getNamespaceID() != 0) {
+ // If the image actually has some data, make sure we're talking
+ // to the same NN as we did before.
+ sig.validateStorageInfo(checkpointImage);
+ } else {
+ // if we're a fresh 2NN, just take the storage info from the server
+ // we first talk to.
+ dstStorage.setStorageInfo(sig);
+ dstStorage.setClusterID(sig.getClusterID());
+ dstStorage.setBlockPoolID(sig.getBlockpoolID());
+ }
// error simulation code for junit test
if (ErrorSimulator.getErrorSimulation(0)) {
@@ -443,14 +515,20 @@ boolean doCheckpoint() throws IOException {
"after creating edits.new");
}
- boolean loadImage = downloadCheckpointFiles(sig); // Fetch fsimage and edits
- doMerge(sig, loadImage); // Do the merge
-
+ RemoteEditLogManifest manifest =
+ namenode.getEditLogManifest(sig.mostRecentCheckpointTxId + 1);
+
+ boolean loadImage = downloadCheckpointFiles(
+ fsName, checkpointImage, sig, manifest); // Fetch fsimage and edits
+ doMerge(sig, manifest, loadImage, checkpointImage);
+
//
// Upload the new image into the NameNode. Then tell the Namenode
// to make this new uploaded image as the most current image.
//
- putFSImage(sig);
+ long txid = checkpointImage.getLastAppliedTxId();
+ TransferFsImage.uploadImageFromStorage(fsName, getImageListenAddress(),
+ dstStorage, txid);
// error simulation code for junit test
if (ErrorSimulator.getErrorSimulation(1)) {
@@ -458,91 +536,53 @@ boolean doCheckpoint() throws IOException {
"after uploading new image to NameNode");
}
- namenode.rollFsImage(sig);
- checkpointImage.endCheckpoint();
-
LOG.warn("Checkpoint done. New Image Size: "
- + checkpointImage.getStorage().getFsImageName().length());
+ + dstStorage.getFsImageName(txid).length());
+
+ // Since we've successfully checkpointed, we can remove some old
+ // image files
+ checkpointImage.purgeOldStorage();
return loadImage;
}
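
Stripped of error simulation and storage-info validation, the new round trip
is a short sequence; a hedged condensation using only calls that appear in
this patch:

    CheckpointSignature sig = namenode.rollEditLog();              // 1. roll
    RemoteEditLogManifest manifest =
        namenode.getEditLogManifest(sig.mostRecentCheckpointTxId + 1); // 2. list
    boolean loadImage = downloadCheckpointFiles(
        fsName, checkpointImage, sig, manifest);                   // 3. fetch
    doMerge(sig, manifest, loadImage, checkpointImage);            // 4. replay
    long txid = checkpointImage.getLastAppliedTxId();
    TransferFsImage.uploadImageFromStorage(fsName,
        getImageListenAddress(), checkpointImage.getStorage(), txid); // 5. upload
    checkpointImage.purgeOldStorage();                             // 6. prune
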
-
- private void startCheckpoint() throws IOException {
- checkpointImage.getStorage().unlockAll();
- checkpointImage.getEditLog().close();
- checkpointImage.recoverCreate(checkpointDirs, checkpointEditsDirs);
- checkpointImage.startCheckpoint();
- }
-
- /**
- * Merge downloaded image and edits and write the new image into
- * current storage directory.
- */
- private void doMerge(CheckpointSignature sig, boolean loadImage)
- throws IOException {
- if (loadImage) {
- namesystem = new FSNamesystem(checkpointImage, conf);
- }
- assert namesystem.dir.fsImage == checkpointImage;
- checkpointImage.doMerge(sig, loadImage);
- }
-
+
+
/**
-   * @param argv The parameters passed to this program.
+   * @param opts The parsed command-line options.
* @exception Exception if the filesystem does not exist.
* @return 0 on success, non zero on error.
*/
- private int processArgs(String[] argv) throws Exception {
-
- if (argv.length < 1) {
- printUsage("");
- return -1;
+ private int processStartupCommand(CommandLineOpts opts) throws Exception {
+ if (opts.getCommand() == null) {
+ return 0;
}
-
- int exitCode = -1;
- int i = 0;
- String cmd = argv[i++];
-
- //
- // verify that we have enough command line parameters
- //
- if ("-geteditsize".equals(cmd)) {
- if (argv.length != 1) {
- printUsage(cmd);
- return exitCode;
- }
- } else if ("-checkpoint".equals(cmd)) {
- if (argv.length != 1 && argv.length != 2) {
- printUsage(cmd);
- return exitCode;
- }
- if (argv.length == 2 && !"force".equals(argv[i])) {
- printUsage(cmd);
- return exitCode;
- }
- }
-
- exitCode = 0;
+
+ String cmd = opts.getCommand().toString().toLowerCase();
+
+ int exitCode = 0;
try {
- if ("-checkpoint".equals(cmd)) {
- long size = namenode.getEditLogSize();
- if (size >= checkpointSize ||
- argv.length == 2 && "force".equals(argv[i])) {
+ switch (opts.getCommand()) {
+ case CHECKPOINT:
+ long count = countUncheckpointedTxns();
+ if (count > checkpointTxnCount ||
+ opts.shouldForceCheckpoint()) {
doCheckpoint();
} else {
- System.err.println("EditLog size " + size + " bytes is " +
+          System.err.println("EditLog size of " + count + " transactions is " +
"smaller than configured checkpoint " +
- "size " + checkpointSize + " bytes.");
+ "interval " + checkpointTxnCount + " transactions.");
System.err.println("Skipping checkpoint.");
}
- } else if ("-geteditsize".equals(cmd)) {
- long size = namenode.getEditLogSize();
- System.out.println("EditLog size is " + size + " bytes");
- } else {
- exitCode = -1;
- LOG.error(cmd.substring(1) + ": Unknown command");
- printUsage("");
+ break;
+ case GETEDITSIZE:
+ long uncheckpointed = countUncheckpointedTxns();
+ System.out.println("NameNode has " + uncheckpointed +
+ " uncheckpointed transactions");
+ break;
+ default:
+ throw new AssertionError("bad command enum: " + opts.getCommand());
}
+
} catch (RemoteException e) {
//
      // This is an error returned by the hadoop server. Print
@@ -551,41 +591,32 @@ private int processArgs(String[] argv) throws Exception {
try {
String[] content;
content = e.getLocalizedMessage().split("\n");
- LOG.error(cmd.substring(1) + ": "
- + content[0]);
+ LOG.error(cmd + ": " + content[0]);
} catch (Exception ex) {
- LOG.error(cmd.substring(1) + ": "
- + ex.getLocalizedMessage());
+ LOG.error(cmd + ": " + ex.getLocalizedMessage());
}
} catch (IOException e) {
//
// IO exception encountered locally.
//
exitCode = -1;
- LOG.error(cmd.substring(1) + ": "
- + e.getLocalizedMessage());
+ LOG.error(cmd + ": " + e.getLocalizedMessage());
} finally {
// Does the RPC connection need to be closed?
}
return exitCode;
}
- /**
- * Displays format of commands.
- * @param cmd The command that is being executed.
- */
- private void printUsage(String cmd) {
- if ("-geteditsize".equals(cmd)) {
- System.err.println("Usage: java SecondaryNameNode"
- + " [-geteditsize]");
- } else if ("-checkpoint".equals(cmd)) {
- System.err.println("Usage: java SecondaryNameNode"
- + " [-checkpoint [force]]");
- } else {
- System.err.println("Usage: java SecondaryNameNode " +
- "[-checkpoint [force]] " +
- "[-geteditsize] ");
- }
+ private long countUncheckpointedTxns() throws IOException {
+ long curTxId = namenode.getTransactionID();
+ long uncheckpointedTxns = curTxId -
+ checkpointImage.getStorage().getMostRecentCheckpointTxId();
+ assert uncheckpointedTxns >= 0;
+ return uncheckpointedTxns;
+ }
+
+ boolean shouldCheckpointBasedOnCount() throws IOException {
+ return countUncheckpointedTxns() >= checkpointTxnCount;
}
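
With the default dfs.namenode.checkpoint.txns of 40000, the count trigger is
simple subtraction; hypothetical numbers:

    long curTxId = 123456;             // namenode.getTransactionID()
    long lastCheckpointed = 90000;     // getMostRecentCheckpointTxId()
    long uncheckpointed = curTxId - lastCheckpointed;   // 33456
    boolean checkpointNow = uncheckpointed >= 40000;    // false; keep waiting
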
/**
@@ -594,41 +625,151 @@ private void printUsage(String cmd) {
* @exception Exception if the filesystem does not exist.
*/
public static void main(String[] argv) throws Exception {
+ CommandLineOpts opts = SecondaryNameNode.parseArgs(argv);
+ if (opts == null) {
+ System.exit(-1);
+ }
+
StringUtils.startupShutdownMessage(SecondaryNameNode.class, argv, LOG);
Configuration tconf = new HdfsConfiguration();
- if (argv.length >= 1) {
- SecondaryNameNode secondary = new SecondaryNameNode(tconf);
- int ret = secondary.processArgs(argv);
+ SecondaryNameNode secondary = new SecondaryNameNode(tconf, opts);
+
+ if (opts.getCommand() != null) {
+ int ret = secondary.processStartupCommand(opts);
System.exit(ret);
}
    // Create a never-ending daemon
- Daemon checkpointThread = new Daemon(new SecondaryNameNode(tconf));
+ Daemon checkpointThread = new Daemon(secondary);
checkpointThread.start();
}
+
+
+ /**
+ * Container for parsed command-line options.
+ */
+ @SuppressWarnings("static-access")
+ static class CommandLineOpts {
+ private final Options options = new Options();
+
+ private final Option geteditsizeOpt;
+ private final Option checkpointOpt;
+ private final Option formatOpt;
+
+ Command cmd;
+ enum Command {
+ GETEDITSIZE,
+ CHECKPOINT;
+ }
+
+ private boolean shouldForce;
+ private boolean shouldFormat;
+
+ CommandLineOpts() {
+ geteditsizeOpt = new Option("geteditsize",
+ "return the number of uncheckpointed transactions on the NameNode");
+ checkpointOpt = OptionBuilder.withArgName("force")
+        .hasOptionalArg().withDescription("checkpoint on startup").create("checkpoint");
+ formatOpt = new Option("format", "format the local storage during startup");
+
+ options.addOption(geteditsizeOpt);
+ options.addOption(checkpointOpt);
+ options.addOption(formatOpt);
+ }
+
+ public boolean shouldFormat() {
+ return shouldFormat;
+ }
+
+ public void parse(String ... argv) throws ParseException {
+ CommandLineParser parser = new PosixParser();
+ CommandLine cmdLine = parser.parse(options, argv);
+
+ boolean hasGetEdit = cmdLine.hasOption(geteditsizeOpt.getOpt());
+ boolean hasCheckpoint = cmdLine.hasOption(checkpointOpt.getOpt());
+ if (hasGetEdit && hasCheckpoint) {
+ throw new ParseException("May not pass both "
+ + geteditsizeOpt.getOpt() + " and "
+ + checkpointOpt.getOpt());
+ }
+
+ if (hasGetEdit) {
+ cmd = Command.GETEDITSIZE;
+ } else if (hasCheckpoint) {
+ cmd = Command.CHECKPOINT;
+
+ String arg = cmdLine.getOptionValue(checkpointOpt.getOpt());
+ if ("force".equals(arg)) {
+ shouldForce = true;
+ } else if (arg != null) {
+ throw new ParseException("-checkpoint may only take 'force' as an "
+ + "argument");
+ }
+ }
+
+ if (cmdLine.hasOption(formatOpt.getOpt())) {
+ shouldFormat = true;
+ }
+ }
+
+ public Command getCommand() {
+ return cmd;
+ }
+
+ public boolean shouldForceCheckpoint() {
+ return shouldForce;
+ }
+
+ void usage() {
+ HelpFormatter formatter = new HelpFormatter();
+ formatter.printHelp("secondarynamenode", options);
+ }
+ }
+
+ private static CommandLineOpts parseArgs(String[] argv) {
+ CommandLineOpts opts = new CommandLineOpts();
+ try {
+ opts.parse(argv);
+ } catch (ParseException pe) {
+ LOG.error(pe.getMessage());
+ opts.usage();
+ return null;
+ }
+ return opts;
+ }
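
A test-style sketch of the parser (hypothetical usage from the same package;
ParseException propagates on bad input):

    CommandLineOpts opts = new CommandLineOpts();
    opts.parse("-checkpoint", "force");
    assert opts.getCommand() == CommandLineOpts.Command.CHECKPOINT;
    assert opts.shouldForceCheckpoint();

    opts = new CommandLineOpts();
    opts.parse("-geteditsize");
    assert opts.getCommand() == CommandLineOpts.Command.GETEDITSIZE;

    // Passing -geteditsize and -checkpoint together, or any -checkpoint
    // argument other than "force", throws ParseException.
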
+
static class CheckpointStorage extends FSImage {
/**
+ * Construct a checkpoint image.
+ * @param conf Node configuration.
+ * @param imageDirs URIs of storage for image.
+   * @param editsDirs URIs of storage for edit logs.
+   * @throws IOException If storage cannot be accessed.
*/
- CheckpointStorage(Configuration conf) throws IOException {
- super(conf);
+ CheckpointStorage(Configuration conf,
+                      Collection<URI> imageDirs,
+                      Collection<URI> editsDirs) throws IOException {
+ super(conf, (FSNamesystem)null, imageDirs, editsDirs);
+ setFSNamesystem(new FSNamesystem(this, conf));
+
+ // the 2NN never writes edits -- it only downloads them. So
+ // we shouldn't have any editLog instance. Setting to null
+ // makes sure we don't accidentally depend on it.
+ editLog = null;
}
/**
* Analyze checkpoint directories.
* Create directories if they do not exist.
- * Recover from an unsuccessful checkpoint is necessary.
- *
- * @param dataDirs
- * @param editsDirs
+   * Recover from an unsuccessful checkpoint if necessary.
+ *
* @throws IOException
*/
-    void recoverCreate(Collection<URI> dataDirs,
-                       Collection<URI> editsDirs) throws IOException {
-      Collection<URI> tempDataDirs = new ArrayList<URI>(dataDirs);
-      Collection<URI> tempEditsDirs = new ArrayList<URI>(editsDirs);
- storage.close();
- storage.setStorageDirectories(tempDataDirs, tempEditsDirs);
+ void recoverCreate(boolean format) throws IOException {
+ storage.attemptRestoreRemovedStorage();
+ storage.unlockAll();
+
    for (Iterator<StorageDirectory> it =
storage.dirIterator(); it.hasNext();) {
StorageDirectory sd = it.next();
@@ -643,6 +784,13 @@ void recoverCreate(Collection dataDirs,
if(!isAccessible)
throw new InconsistentFSStateException(sd.getRoot(),
"cannot access checkpoint directory.");
+
+ if (format) {
+ // Don't confirm, since this is just the secondary namenode.
+ LOG.info("Formatting storage directory " + sd);
+ sd.clearDirectory();
+ }
+
StorageState curState;
try {
curState = sd.analyzeStorage(HdfsConstants.StartupOption.REGULAR, storage);
@@ -655,6 +803,11 @@ void recoverCreate(Collection