From 20a076bafce548298729bab4fb81d12f829e8f7e Mon Sep 17 00:00:00 2001
From: Andrew Wang
Date: Thu, 18 Sep 2014 17:35:24 -0700
Subject: [PATCH] HDFS-6970. Move startFile EDEK retries to the DFSClient.
 (wang)

---
 hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt   |   2 +
 .../apache/hadoop/hdfs/DFSOutputStream.java   |  64 ++++++---
 .../hdfs/server/namenode/FSNamesystem.java    | 128 ++++++++----------
 .../namenode/RetryStartFileException.java     |  17 ++-
 .../hadoop/hdfs/TestEncryptionZones.java      |   2 +-
 5 files changed, 122 insertions(+), 91 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
index c99207b5150..c7339f080b9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
+++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt
@@ -562,6 +562,8 @@ Release 2.6.0 - UNRELEASED
     HDFS-6727. Refresh data volumes on DataNode based on configuration changes
     (Lei Xu via cmccabe)
 
+    HDFS-6970. Move startFile EDEK retries to the DFSClient. (wang)
+
   OPTIMIZATIONS
 
     HDFS-6690. Deduplicate xattr names in memory. (wang)
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java
index d368f4efca5..94a1ddc0306 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSOutputStream.java
@@ -41,6 +41,7 @@ import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.concurrent.atomic.AtomicReference;
 
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.crypto.CipherSuite;
 import org.apache.hadoop.fs.CanSetDropBehind;
@@ -76,6 +77,7 @@ import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.hdfs.server.namenode.NotReplicatedYetException;
+import org.apache.hadoop.hdfs.server.namenode.RetryStartFileException;
 import org.apache.hadoop.hdfs.server.namenode.SafeModeException;
 import org.apache.hadoop.io.EnumSetWritable;
 import org.apache.hadoop.io.IOUtils;
@@ -126,6 +128,13 @@ import com.google.common.cache.RemovalNotification;
 public class DFSOutputStream extends FSOutputSummer
     implements Syncable, CanSetDropBehind {
   private static final int MAX_PACKETS = 80; // each packet 64K, total 5MB
+  /**
+   * Number of times to retry creating a file when there are transient
+   * errors (typically related to encryption zones and KeyProvider operations).
+   */
+  @VisibleForTesting
+  public static final int CREATE_RETRY_COUNT = 10;
+
   private final DFSClient dfsClient;
   private final long dfsclientSlowLogThresholdMs;
   private Socket s;
@@ -1648,23 +1657,46 @@ public class DFSOutputStream extends FSOutputSummer
       short replication, long blockSize, Progressable progress,
       int buffersize, DataChecksum checksum, String[] favoredNodes,
       List<CipherSuite> cipherSuites) throws IOException {
-    final HdfsFileStatus stat;
-    try {
-      stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
-          new EnumSetWritable<CreateFlag>(flag), createParent, replication,
-          blockSize, cipherSuites);
-    } catch(RemoteException re) {
-      throw re.unwrapRemoteException(AccessControlException.class,
-          DSQuotaExceededException.class,
-          FileAlreadyExistsException.class,
-          FileNotFoundException.class,
-          ParentNotDirectoryException.class,
-          NSQuotaExceededException.class,
-          SafeModeException.class,
-          UnresolvedPathException.class,
-          SnapshotAccessControlException.class,
-          UnknownCipherSuiteException.class);
+    HdfsFileStatus stat = null;
+
+    // Retry the create if we get a RetryStartFileException up to a maximum
+    // number of times
+    boolean shouldRetry = true;
+    int retryCount = CREATE_RETRY_COUNT;
+    while (shouldRetry) {
+      shouldRetry = false;
+      try {
+        stat = dfsClient.namenode.create(src, masked, dfsClient.clientName,
+            new EnumSetWritable<CreateFlag>(flag), createParent, replication,
+            blockSize, cipherSuites);
+        break;
+      } catch (RemoteException re) {
+        IOException e = re.unwrapRemoteException(
+            AccessControlException.class,
+            DSQuotaExceededException.class,
+            FileAlreadyExistsException.class,
+            FileNotFoundException.class,
+            ParentNotDirectoryException.class,
+            NSQuotaExceededException.class,
+            RetryStartFileException.class,
+            SafeModeException.class,
+            UnresolvedPathException.class,
+            SnapshotAccessControlException.class,
+            UnknownCipherSuiteException.class);
+        if (e instanceof RetryStartFileException) {
+          if (retryCount > 0) {
+            shouldRetry = true;
+            retryCount--;
+          } else {
+            throw new IOException("Too many retries because of encryption" +
+                " zone operations", e);
+          }
+        } else {
+          throw e;
+        }
+      }
     }
+    Preconditions.checkNotNull(stat, "HdfsFileStatus should not be null!");
     final DFSOutputStream out = new DFSOutputStream(dfsClient, src, stat,
         flag, progress, checksum, favoredNodes);
     out.start();
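
Illustration (not part of the patch): the client-side change above boils down to a bounded retry loop around the create() RPC. A minimal, self-contained Java sketch of that pattern follows; TransientException stands in for RetryStartFileException and the Callable stands in for the namenode call, so these names are illustrative only.

    import java.io.IOException;
    import java.util.concurrent.Callable;

    // Sketch of a bounded retry on a transient failure. "budget" plays the
    // role of CREATE_RETRY_COUNT: up to budget retries after the first try.
    public class BoundedRetrySketch {
      static class TransientException extends IOException {
        private static final long serialVersionUID = 1L;
      }

      static <T> T callWithRetries(Callable<T> op, int budget)
          throws IOException {
        int retriesLeft = budget;
        while (true) {
          try {
            return op.call(); // success ends the loop
          } catch (TransientException e) {
            if (retriesLeft-- > 0) {
              continue; // transient failure: retry within the budget
            }
            throw new IOException("Too many retries", e);
          } catch (Exception e) {
            throw new IOException(e); // non-transient failures propagate
          }
        }
      }
    }

Note the shape matches the hunk above: retryable failures are distinguished from the other unwrapped exception types, and the last transient error is preserved as the cause when the budget runs out.
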
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
index fa7f34f9cb1..d0a1af65add 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
@@ -2490,84 +2490,66 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
 
     waitForLoadingFSImage();
 
-    /*
-     * We want to avoid holding any locks while doing KeyProvider operations,
-     * since they can be very slow. Since the path can
-     * flip flop between being in an encryption zone and not in the meantime,
-     * we need to recheck the preconditions and redo KeyProvider operations
-     * in some situations.
-     *
-     * A special RetryStartFileException is used to indicate that we should
-     * retry creation of a FileEncryptionInfo.
+    /**
+     * If the file is in an encryption zone, we optimistically create an
+     * EDEK for the file by calling out to the configured KeyProvider.
+     * Since this typically involves doing an RPC, we take the readLock
+     * initially, then drop it to do the RPC.
+     *
+     * Since the path can flip-flop between being in an encryption zone and not
+     * in the meantime, we need to recheck the preconditions when we retake the
+     * lock to do the create. If the preconditions are not met, we throw a
+     * special RetryStartFileException to ask the DFSClient to try the create
+     * again later.
      */
-    BlocksMapUpdateInfo toRemoveBlocks = null;
+    CipherSuite suite = null;
+    String ezKeyName = null;
+    readLock();
     try {
-      boolean shouldContinue = true;
-      int iters = 0;
-      while (shouldContinue) {
-        skipSync = false;
-        if (iters >= 10) {
-          throw new IOException("Too many retries because of encryption zone " +
-              "operations, something might be broken!");
-        }
-        shouldContinue = false;
-        iters++;
-
-        // Optimistically determine CipherSuite and ezKeyName if the path is
-        // currently within an encryption zone
-        CipherSuite suite = null;
-        String ezKeyName = null;
-        readLock();
-        try {
-          src = resolvePath(src, pathComponents);
-          INodesInPath iip = dir.getINodesInPath4Write(src);
-          // Nothing to do if the path is not within an EZ
-          if (dir.isInAnEZ(iip)) {
-            suite = chooseCipherSuite(iip, cipherSuites);
-            if (suite != null) {
-              Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
-                  "Chose an UNKNOWN CipherSuite!");
-            }
-            ezKeyName = dir.getKeyName(iip);
-            Preconditions.checkState(ezKeyName != null);
-          }
-        } finally {
-          readUnlock();
-        }
-
-        Preconditions.checkState(
-            (suite == null && ezKeyName == null) ||
-            (suite != null && ezKeyName != null),
-            "Both suite and ezKeyName should both be null or not null");
-        // Generate EDEK if necessary while not holding the lock
-        EncryptedKeyVersion edek =
-            generateEncryptedDataEncryptionKey(ezKeyName);
-        EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
-        // Try to create the file with the computed cipher suite and EDEK
-        writeLock();
-        try {
-          checkOperation(OperationCategory.WRITE);
-          checkNameNodeSafeMode("Cannot create file" + src);
-          src = resolvePath(src, pathComponents);
-          toRemoveBlocks = startFileInternal(pc, src, permissions, holder,
-              clientMachine, create, overwrite, createParent, replication,
-              blockSize, suite, edek, logRetryCache);
-          stat = dir.getFileInfo(src, false,
-              FSDirectory.isReservedRawName(srcArg), false);
-        } catch (StandbyException se) {
-          skipSync = true;
-          throw se;
-        } catch (RetryStartFileException e) {
-          shouldContinue = true;
-          if (LOG.isTraceEnabled()) {
-            LOG.trace("Preconditions failed, retrying creation of " +
-                "FileEncryptionInfo", e);
-          }
-        } finally {
-          writeUnlock();
+      src = resolvePath(src, pathComponents);
+      INodesInPath iip = dir.getINodesInPath4Write(src);
+      // Nothing to do if the path is not within an EZ
+      if (dir.isInAnEZ(iip)) {
+        suite = chooseCipherSuite(iip, cipherSuites);
+        if (suite != null) {
+          Preconditions.checkArgument(!suite.equals(CipherSuite.UNKNOWN),
+              "Chose an UNKNOWN CipherSuite!");
         }
+        ezKeyName = dir.getKeyName(iip);
+        Preconditions.checkState(ezKeyName != null);
       }
     } finally {
+      readUnlock();
+    }
+
+    Preconditions.checkState(
+        (suite == null && ezKeyName == null) ||
+        (suite != null && ezKeyName != null),
+        "Both suite and ezKeyName should both be null or not null");
+
+    // Generate EDEK if necessary while not holding the lock
+    EncryptedKeyVersion edek =
+        generateEncryptedDataEncryptionKey(ezKeyName);
+    EncryptionFaultInjector.getInstance().startFileAfterGenerateKey();
+
+    // Proceed with the create, using the computed cipher suite and
+    // generated EDEK
+    BlocksMapUpdateInfo toRemoveBlocks = null;
+    writeLock();
+    try {
+      checkOperation(OperationCategory.WRITE);
+      checkNameNodeSafeMode("Cannot create file" + src);
+      src = resolvePath(src, pathComponents);
+      toRemoveBlocks = startFileInternal(pc, src, permissions, holder,
+          clientMachine, create, overwrite, createParent, replication,
+          blockSize, suite, edek, logRetryCache);
+      stat = dir.getFileInfo(src, false,
+          FSDirectory.isReservedRawName(srcArg), false);
+    } catch (StandbyException se) {
+      skipSync = true;
+      throw se;
+    } finally {
+      writeUnlock();
       // There might be transactions logged while trying to recover the lease.
       // They need to be sync'ed even when an exception was thrown.
       if (!skipSync) {
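
Illustration (not part of the patch): the NameNode-side rewrite above is an instance of a lock-split pattern: snapshot the preconditions under the read lock, perform the slow external call (the KeyProvider RPC) with no lock held, then revalidate under the write lock and signal a retry if the state changed in between. A self-contained sketch under those assumptions; RetryException, generateEdek, and commit are illustrative names, not Hadoop APIs.

    import java.io.IOException;
    import java.util.Objects;
    import java.util.concurrent.locks.ReadWriteLock;
    import java.util.concurrent.locks.ReentrantReadWriteLock;

    public class LockSplitSketch {
      static class RetryException extends IOException {
        private static final long serialVersionUID = 1L;
      }

      private final ReadWriteLock lock = new ReentrantReadWriteLock();
      private String currentKeyName; // guarded by lock; stands in for EZ state

      void create() throws IOException {
        final String observed;
        lock.readLock().lock();
        try {
          observed = currentKeyName; // snapshot preconditions under read lock
        } finally {
          lock.readLock().unlock();
        }

        // Slow external call runs with no lock held.
        byte[] edek = generateEdek(observed);

        lock.writeLock().lock();
        try {
          if (!Objects.equals(observed, currentKeyName)) {
            // Preconditions changed while unlocked; ask the client to retry.
            throw new RetryException();
          }
          commit(edek); // safe: state matches what the EDEK was generated for
        } finally {
          lock.writeLock().unlock();
        }
      }

      private byte[] generateEdek(String keyName) { return new byte[0]; }
      private void commit(byte[] edek) { }
    }

The design change is that the retry now crosses the RPC boundary: instead of looping inside FSNamesystem, the NameNode fails fast with RetryStartFileException and the DFSClient supplies the loop.
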
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/RetryStartFileException.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/RetryStartFileException.java
index a5758a7e0e4..0bdd2a58217 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/RetryStartFileException.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/RetryStartFileException.java
@@ -17,5 +17,20 @@
  */
 package org.apache.hadoop.hdfs.server.namenode;
 
-public class RetryStartFileException extends Exception {
+import java.io.IOException;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+@InterfaceAudience.Private
+public class RetryStartFileException extends IOException {
+  private static final long serialVersionUID = 1L;
+
+  public RetryStartFileException() {
+    super("Preconditions for creating a file failed because of a " +
+        "transient error, retry create later.");
+  }
+
+  public RetryStartFileException(String s) {
+    super(s);
+  }
 }
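
Illustration (not part of the patch): RetryStartFileException now extends IOException so it can travel across the RPC boundary and be recovered on the client via RemoteException#unwrapRemoteException; the serialVersionUID keeps serialization stable. A minimal sketch of the client-side dispatch, assuming the Hadoop classes below are on the classpath (isRetryable is a hypothetical helper, not a Hadoop API):

    import java.io.IOException;

    import org.apache.hadoop.hdfs.server.namenode.RetryStartFileException;
    import org.apache.hadoop.ipc.RemoteException;

    public class UnwrapSketch {
      /** Returns true if the server-side failure is the retryable kind. */
      static boolean isRetryable(RemoteException re) {
        IOException e = re.unwrapRemoteException(RetryStartFileException.class);
        return e instanceof RetryStartFileException;
      }
    }
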
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java
index ff2820072cc..52ca942ce34 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestEncryptionZones.java
@@ -940,7 +940,7 @@ public class TestEncryptionZones {
     Future future = executor.submit(new CreateFileTask(fsWrapper, file));
 
     // Flip-flop between two EZs to repeatedly fail
-    for (int i=0; i<10; i++) {
+    for (int i=0; i<DFSOutputStream.CREATE_RETRY_COUNT; i++) {
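
Illustration (not part of the patch): the test drives the new client-side loop by racing a create against repeated encryption zone changes, which is why CREATE_RETRY_COUNT was made public and @VisibleForTesting. Its shape, reduced to a self-contained sketch; createFile and flipEncryptionZone are hypothetical stand-ins for CreateFileTask and the test's EZ manipulation.

    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;
    import java.util.concurrent.TimeUnit;

    public class RetryTestSketch {
      static void run() throws Exception {
        ExecutorService executor = Executors.newSingleThreadExecutor();
        // Start the create on another thread; it retries on transient errors.
        Future<Void> future = executor.submit(new Callable<Void>() {
          @Override
          public Void call() {
            createFile();
            return null;
          }
        });
        // Perturb the precondition up to the client's retry budget.
        for (int i = 0; i < 10; i++) { // mirrors CREATE_RETRY_COUNT
          flipEncryptionZone();
        }
        // The create should still succeed within the budget.
        future.get(1, TimeUnit.MINUTES);
        executor.shutdown();
      }

      static void createFile() { }
      static void flipEncryptionZone() { }
    }
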