HDFS-9533. seen_txid in the shared edits directory is modified during bootstrapping. Contributed by Kihwal Lee.

(cherry picked from commit 5cb1e0118b)

Conflicts:
	hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/BootstrapStandby.java
	hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestBootstrapStandby.java
(cherry picked from commit 3c0ae5914c)
This commit is contained in:
Kihwal Lee 2015-12-17 17:28:24 -06:00
parent 56882e7b1c
commit 03292b234e
5 changed files with 43 additions and 3 deletions

View File

@ -36,6 +36,9 @@ Release 2.7.3 - UNRELEASED
HDFS-9516. Truncate file fails with data dirs on multiple disks. HDFS-9516. Truncate file fails with data dirs on multiple disks.
(Plamen Jeliazkov via shv) (Plamen Jeliazkov via shv)
HDFS-9533. seen_txid in the shared edits directory is modified during
bootstrapping (kihwal)
Release 2.7.2 - UNRELEASED Release 2.7.2 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES

View File

@ -473,8 +473,24 @@ public class NNStorage extends Storage implements Closeable,
* @param txid the txid that has been reached * @param txid the txid that has been reached
*/ */
public void writeTransactionIdFileToStorage(long txid) { public void writeTransactionIdFileToStorage(long txid) {
writeTransactionIdFileToStorage(txid, null);
}
/**
* Write a small file in the available storage directories of the given
* type that indicates that the namespace has reached some given
* transaction ID.
*
* This is used when the image is loaded to avoid accidental rollbacks
* in the case where an edit log is fully deleted but there is no
* checkpoint. See TestNameEditsConfigs.testNameEditsConfigsFailure()
* @param txid the txid that has been reached
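* @param type the type of storage directory to write to; the single-argument
* overload passes {@code null} to keep writing to all storage directories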
*/
public void writeTransactionIdFileToStorage(long txid,
NameNodeDirType type) {
// Write txid marker in all storage directories // Write txid marker in all storage directories
for (StorageDirectory sd : storageDirs) { for (Iterator<StorageDirectory> it = dirIterator(type); it.hasNext();) {
StorageDirectory sd = it.next();
try { try {
writeTransactionIdFile(sd, txid); writeTransactionIdFile(sd, txid);
} catch(IOException e) { } catch(IOException e) {

View File

@ -51,6 +51,7 @@ import org.apache.hadoop.hdfs.server.namenode.FSImage;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem; import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.NNStorage; import org.apache.hadoop.hdfs.server.namenode.NNStorage;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile; import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.hdfs.server.namenode.NNUpgradeUtil; import org.apache.hadoop.hdfs.server.namenode.NNUpgradeUtil;
import org.apache.hadoop.hdfs.server.namenode.NameNode; import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage; import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
@ -312,13 +313,14 @@ public class BootstrapStandby implements Tool, Configurable {
return ERR_CODE_LOGS_UNAVAILABLE; return ERR_CODE_LOGS_UNAVAILABLE;
} }
image.getStorage().writeTransactionIdFileToStorage(curTxId);
// Download that checkpoint into our storage directories. // Download that checkpoint into our storage directories.
MD5Hash hash = TransferFsImage.downloadImageToStorage( MD5Hash hash = TransferFsImage.downloadImageToStorage(
otherHttpAddr, imageTxId, storage, true); otherHttpAddr, imageTxId, storage, true);
image.saveDigestAndRenameCheckpointImage(NameNodeFile.IMAGE, imageTxId, image.saveDigestAndRenameCheckpointImage(NameNodeFile.IMAGE, imageTxId,
hash); hash);
// Write seen_txid to the formatted image directories.
storage.writeTransactionIdFileToStorage(imageTxId, NameNodeDirType.IMAGE);
} catch (IOException ioe) { } catch (IOException ioe) {
image.close(); image.close();
throw ioe; throw ioe;

View File

@ -568,4 +568,11 @@ public abstract class FSImageTestUtil {
FSImageTestUtil.assertParallelFilesAreIdentical(curDirs, FSImageTestUtil.assertParallelFilesAreIdentical(curDirs,
ignoredFiles); ignoredFiles);
} }
/**
 * Looks up the storage directory that the given NameNode's image storage
 * associates with {@code storageUri} and returns the transaction ID that
 * {@link NNStorage#readTransactionIdFile} reads from it.
 *
 * @param node the NameNode whose FSImage storage is consulted
 * @param storageUri URI identifying the storage directory to inspect
 * @return the transaction ID read from that directory's transaction ID file
 * @throws IOException if reading the transaction ID file fails
 */
public static long getStorageTxId(NameNode node, URI storageUri)
    throws IOException {
  // NOTE(review): presumably storageUri must name a directory known to the
  // node's storage; behaviour for an unknown URI is not visible here.
  return NNStorage.readTransactionIdFile(
      getFSImage(node).getStorage().getStorageDirectory(storageUri));
}
} }

View File

@ -29,6 +29,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.MiniDFSNNTopology; import org.apache.hadoop.hdfs.MiniDFSNNTopology;
import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature; import org.apache.hadoop.hdfs.server.namenode.CheckpointSignature;
@ -126,6 +127,13 @@ public class TestBootstrapStandby {
.getFSImage().getMostRecentCheckpointTxId(); .getFSImage().getMostRecentCheckpointTxId();
assertEquals(6, expectedCheckpointTxId); assertEquals(6, expectedCheckpointTxId);
// advance the current txid
cluster.getFileSystem(0).create(new Path("/test_txid"), (short)1).close();
// obtain the content of seen_txid
URI editsUri = cluster.getSharedEditsDir(0, 1);
long seen_txid_shared = FSImageTestUtil.getStorageTxId(nn0, editsUri);
int rc = BootstrapStandby.run( int rc = BootstrapStandby.run(
new String[]{"-force"}, new String[]{"-force"},
cluster.getConfiguration(1)); cluster.getConfiguration(1));
@ -136,6 +144,10 @@ public class TestBootstrapStandby {
ImmutableList.of((int)expectedCheckpointTxId)); ImmutableList.of((int)expectedCheckpointTxId));
FSImageTestUtil.assertNNFilesMatch(cluster); FSImageTestUtil.assertNNFilesMatch(cluster);
// Make sure the seen_txid was not modified by the standby
assertEquals(seen_txid_shared,
FSImageTestUtil.getStorageTxId(nn0, editsUri));
// We should now be able to start the standby successfully. // We should now be able to start the standby successfully.
cluster.restartNameNode(1); cluster.restartNameNode(1);
} }