Merge trunk to HDFS-4685.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-4685@1567994 13f79535-47bb-0310-9956-ffa450edef68
Chris Nauroth 2014-02-13 18:21:46 +00:00
commit a49dee06fb
81 changed files with 4391 additions and 2410 deletions

View File

@ -334,6 +334,9 @@ Release 2.4.0 - UNRELEASED
HADOOP-10326. M/R jobs can not access S3 if Kerberos is enabled. (bc Wong
via atm)
HADOOP-10338. Cannot get the FileStatus of the root inode from the new
Globber (cmccabe)
Release 2.3.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -114,7 +114,8 @@ private String schemeFromPath(Path path) throws IOException {
if (fs != null) {
scheme = fs.getUri().getScheme();
} else {
scheme = fc.getDefaultFileSystem().getUri().getScheme();
scheme = fc.getFSofPath(fc.fixRelativePart(path)).
getUri().getScheme();
}
}
return scheme;
@ -126,7 +127,8 @@ private String authorityFromPath(Path path) throws IOException {
if (fs != null) {
authority = fs.getUri().getAuthority();
} else {
authority = fc.getDefaultFileSystem().getUri().getAuthority();
authority = fc.getFSofPath(fc.fixRelativePart(path)).
getUri().getAuthority();
}
}
return authority;
@ -162,18 +164,26 @@ public FileStatus[] glob() throws IOException {
// Starting out at the root of the filesystem, we try to match
// filesystem entries against pattern components.
ArrayList<FileStatus> candidates = new ArrayList<FileStatus>(1);
// To get the "real" FileStatus of root, we'd have to do an expensive
// RPC to the NameNode. So we create a placeholder FileStatus which has
// the correct path, but defaults for the rest of the information.
// Later, if it turns out we actually want the FileStatus of root, we'll
// replace the placeholder with a real FileStatus obtained from the
// NameNode.
FileStatus rootPlaceholder;
if (Path.WINDOWS && !components.isEmpty()
&& Path.isWindowsAbsolutePath(absPattern.toUri().getPath(), true)) {
// On Windows the path could begin with a drive letter, e.g. /E:/foo.
// We will skip matching the drive letter and start from listing the
// root of the filesystem on that drive.
String driveLetter = components.remove(0);
candidates.add(new FileStatus(0, true, 0, 0, 0, new Path(scheme,
authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR)));
rootPlaceholder = new FileStatus(0, true, 0, 0, 0, new Path(scheme,
authority, Path.SEPARATOR + driveLetter + Path.SEPARATOR));
} else {
candidates.add(new FileStatus(0, true, 0, 0, 0,
new Path(scheme, authority, Path.SEPARATOR)));
rootPlaceholder = new FileStatus(0, true, 0, 0, 0,
new Path(scheme, authority, Path.SEPARATOR));
}
candidates.add(rootPlaceholder);
for (int componentIdx = 0; componentIdx < components.size();
componentIdx++) {
@ -245,6 +255,12 @@ public FileStatus[] glob() throws IOException {
candidates = newCandidates;
}
for (FileStatus status : candidates) {
// Use object equality to see if this status is the root placeholder.
// See the explanation for rootPlaceholder above for more information.
if (status == rootPlaceholder) {
status = getFileStatus(rootPlaceholder.getPath());
if (status == null) continue;
}
// HADOOP-3497 semantics: the user-defined filter is applied at the
// end, once the full path is built up.
if (filter.accept(status.getPath())) {

View File

@ -220,7 +220,7 @@ final public static void setPingInterval(Configuration conf, int pingInterval) {
* @param conf Configuration
* @return the ping interval
*/
final static int getPingInterval(Configuration conf) {
final public static int getPingInterval(Configuration conf) {
return conf.getInt(CommonConfigurationKeys.IPC_PING_INTERVAL_KEY,
CommonConfigurationKeys.IPC_PING_INTERVAL_DEFAULT);
}
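With getPingInterval now public, code outside org.apache.hadoop.ipc can read the effective ping interval directly. A minimal usage sketch (hypothetical, not part of this patch; the class name PingIntervalProbe is invented for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ipc.Client;

public class PingIntervalProbe {
  public static void main(String[] args) {
    // Reads ipc.ping.interval from the configuration, falling back to the
    // default when the key is unset; callable here only because the
    // accessor is now public.
    Configuration conf = new Configuration();
    System.out.println("ping interval (ms): " + Client.getPingInterval(conf));
  }
}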

View File

@ -37,6 +37,7 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.NativeCodeLoader;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.Uninterruptibles;
@ -48,7 +49,7 @@
* See {@link DomainSocket} for more information about UNIX domain sockets.
*/
@InterfaceAudience.LimitedPrivate("HDFS")
public final class DomainSocketWatcher extends Thread implements Closeable {
public final class DomainSocketWatcher implements Closeable {
static {
if (SystemUtils.IS_OS_WINDOWS) {
loadingFailureReason = "UNIX Domain sockets are not available on Windows.";
@ -281,7 +282,7 @@ public void add(DomainSocket sock, Handler handler) {
try {
processedCond.await();
} catch (InterruptedException e) {
this.interrupt();
Thread.currentThread().interrupt();
}
if (!toAdd.contains(entry)) {
break;
@ -308,7 +309,7 @@ public void remove(DomainSocket sock) {
try {
processedCond.await();
} catch (InterruptedException e) {
this.interrupt();
Thread.currentThread().interrupt();
}
if (!toRemove.containsKey(sock.fd)) {
break;
@ -381,7 +382,8 @@ private void sendCallback(String caller, TreeMap<Integer, Entry> entries,
}
}
private final Thread watcherThread = new Thread(new Runnable() {
@VisibleForTesting
final Thread watcherThread = new Thread(new Runnable() {
@Override
public void run() {
LOG.info(this + ": starting with interruptCheckPeriodMs = " +
@ -443,6 +445,7 @@ public void run() {
} catch (IOException e) {
LOG.error(toString() + " terminating on IOException", e);
} finally {
kick(); // allow the handler for notificationSockets[0] to read a byte
for (Entry entry : entries.values()) {
sendCallback("close", entries, fdSet, entry.getDomainSocket().fd);
}

View File

@ -0,0 +1,53 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.util;
import java.util.concurrent.locks.Condition;
/**
* Represents an object that you can wait for.
*/
public class Waitable<T> {
private T val;
private final Condition cond;
public Waitable(Condition cond) {
this.val = null;
this.cond = cond;
}
public T await() throws InterruptedException {
while (this.val == null) {
this.cond.await();
}
return this.val;
}
public void provide(T val) {
this.val = val;
this.cond.signalAll();
}
public boolean hasVal() {
return this.val != null;
}
public T getVal() {
return this.val;
}
}
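A brief usage sketch for Waitable (hypothetical example, not part of the patch): the Condition handed to the constructor must come from a Lock, and callers must hold that lock around await() and provide().

import java.util.concurrent.locks.ReentrantLock;
import org.apache.hadoop.util.Waitable;

public class WaitableExample {
  public static void main(String[] args) throws InterruptedException {
    final ReentrantLock lock = new ReentrantLock();
    final Waitable<String> result = new Waitable<String>(lock.newCondition());

    // Producer thread: publishes a value and signals all waiters.
    new Thread(new Runnable() {
      @Override
      public void run() {
        lock.lock();
        try {
          result.provide("done");
        } finally {
          lock.unlock();
        }
      }
    }).start();

    // Consumer: await() loops on the condition until a value is present,
    // and must be called with the lock held.
    lock.lock();
    try {
      System.out.println(result.await());
    } finally {
      lock.unlock();
    }
  }
}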

View File

@ -73,9 +73,10 @@ public boolean handle(DomainSocket sock) {
*/
@Test(timeout=60000)
public void testInterruption() throws Exception {
DomainSocketWatcher watcher = new DomainSocketWatcher(10);
watcher.interrupt();
Uninterruptibles.joinUninterruptibly(watcher);
final DomainSocketWatcher watcher = new DomainSocketWatcher(10);
watcher.watcherThread.interrupt();
Uninterruptibles.joinUninterruptibly(watcher.watcherThread);
watcher.close();
}
@Test(timeout=300000)

View File

@ -338,6 +338,8 @@ Trunk (Unreleased)
HDFS-5915. Refactor FSImageFormatProtobuf to simplify cross section reads.
(Haohui Mai via cnauroth)
HDFS-5847. Consolidate INodeReference into a separate section. (jing9)
Release 2.4.0 - UNRELEASED
INCOMPATIBLE CHANGES
@ -365,6 +367,15 @@ Release 2.4.0 - UNRELEASED
HDFS-4370. Fix typo Blanacer in DataNode. (Chu Tong via shv)
HDFS-5929. Add blockpool % usage to HDFS federated nn page.
(Siqi Li via suresh)
HDFS-5810. Unify mmap cache and short-circuit file descriptor cache
(cmccabe)
HDFS-5940. Minor cleanups to ShortCircuitReplica, FsDatasetCache, and
DomainSocketWatcher (cmccabe)
OPTIMIZATIONS
HDFS-5790. LeaseManager.findPath is very slow when many leases need recovery
@ -410,6 +421,20 @@ Release 2.4.0 - UNRELEASED
HDFS-5886. Potential null pointer deference in RpcProgramNfs3#readlink()
(brandonli)
HDFS-4858. HDFS DataNode to NameNode RPC should timeout.
(Henry Wang via shv)
HDFS-5879. Some TestHftpFileSystem tests do not close streams.
(Gera Shegalov via suresh)
HDFS-5938. Make BlockReaderFactory#BlockReaderPeer a static class to avoid
a findbugs warning. (cmccabe)
HDFS-5891. webhdfs should not try connecting the DN during redirection
(Haohui Mai via brandonli)
HDFS-5904. TestFileStatus fails intermittently. (Mit Desai via kihwal)
Release 2.3.1 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -23,7 +23,6 @@
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
/**
@ -97,6 +96,5 @@ public interface BlockReader extends ByteBufferReadable {
* @return The ClientMmap object, or null if mmap is not
* supported.
*/
ClientMmap getClientMmap(EnumSet<ReadOption> opts,
ClientMmapManager mmapManager);
ClientMmap getClientMmap(EnumSet<ReadOption> opts);
}

View File

@ -24,217 +24,748 @@
import java.io.IOException;
import java.net.InetSocketAddress;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitCache.ShortCircuitReplicaCreator;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Time;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
/**
* Utility class to create BlockReader implementations.
*/
@InterfaceAudience.Private
public class BlockReaderFactory {
/**
* Create a new BlockReader specifically to satisfy a read.
* This method also sends the OP_READ_BLOCK request.
*
* @param conf the DFSClient configuration
* @param file File location
* @param block The block object
* @param blockToken The block token for security
* @param startOffset The read offset, relative to block head
* @param len The number of bytes to read, or -1 to read as many as
* possible.
* @param bufferSize The IO buffer size (not the client buffer size)
* Ignored except on the legacy BlockReader.
* @param verifyChecksum Whether to verify checksum
* @param clientName Client name. Used for log messages.
* @param peer The peer
* @param datanodeID The datanode that the Peer is connected to
* @param domainSocketFactory The DomainSocketFactory to notify if the Peer
* is a DomainPeer which turns out to be faulty.
* If null, no factory will be notified in this
* case.
* @param allowShortCircuitLocalReads True if short-circuit local reads
* should be allowed.
* @return New BlockReader instance
*/
public static BlockReader newBlockReader(DFSClient.Conf conf,
String file,
ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken,
long startOffset, long len,
boolean verifyChecksum,
String clientName,
Peer peer,
DatanodeID datanodeID,
DomainSocketFactory domSockFactory,
PeerCache peerCache,
FileInputStreamCache fisCache,
boolean allowShortCircuitLocalReads,
CachingStrategy cachingStrategy)
throws IOException {
peer.setReadTimeout(conf.socketTimeout);
peer.setWriteTimeout(HdfsServerConstants.WRITE_TIMEOUT);
public class BlockReaderFactory implements ShortCircuitReplicaCreator {
static final Log LOG = LogFactory.getLog(BlockReaderFactory.class);
if (peer.getDomainSocket() != null) {
if (allowShortCircuitLocalReads && !conf.useLegacyBlockReaderLocal) {
// If this is a domain socket, and short-circuit local reads are
// enabled, try to set up a BlockReaderLocal.
BlockReader reader = newShortCircuitBlockReader(conf, file,
block, blockToken, startOffset, len, peer, datanodeID,
domSockFactory, verifyChecksum, fisCache, cachingStrategy);
@VisibleForTesting
static ShortCircuitReplicaCreator
createShortCircuitReplicaInfoCallback = null;
private final DFSClient.Conf conf;
/**
* The file name, for logging and debugging purposes.
*/
private String fileName;
/**
* The block ID and block pool ID to use.
*/
private ExtendedBlock block;
/**
* The block token to use for security purposes.
*/
private Token<BlockTokenIdentifier> token;
/**
* The offset within the block to start reading at.
*/
private long startOffset;
/**
* If false, we won't try to verify the block checksum.
*/
private boolean verifyChecksum;
/**
* The name of this client.
*/
private String clientName;
/**
* The DataNode we're talking to.
*/
private DatanodeInfo datanode;
/**
* If false, we won't try short-circuit local reads.
*/
private boolean allowShortCircuitLocalReads;
/**
* The ClientContext to use for things like the PeerCache.
*/
private ClientContext clientContext;
/**
* Number of bytes to read. -1 indicates no limit.
*/
private long length = -1;
/**
* Caching strategy to use when reading the block.
*/
private CachingStrategy cachingStrategy;
/**
* Socket address to use to connect to peer.
*/
private InetSocketAddress inetSocketAddress;
/**
* Remote peer factory to use to create a peer, if needed.
*/
private RemotePeerFactory remotePeerFactory;
/**
* UserGroupInformation to use for legacy block reader local objects, if needed.
*/
private UserGroupInformation userGroupInformation;
/**
* Configuration to use for legacy block reader local objects, if needed.
*/
private Configuration configuration;
/**
* Information about the domain socket path we should use to connect to the
* local peer-- or null if we haven't examined the local domain socket.
*/
private DomainSocketFactory.PathInfo pathInfo;
/**
* The remaining number of times that we'll try to pull a socket out of the
* cache.
*/
private int remainingCacheTries;
public BlockReaderFactory(DFSClient.Conf conf) {
this.conf = conf;
this.remainingCacheTries = conf.nCachedConnRetry;
}
public BlockReaderFactory setFileName(String fileName) {
this.fileName = fileName;
return this;
}
public BlockReaderFactory setBlock(ExtendedBlock block) {
this.block = block;
return this;
}
public BlockReaderFactory setBlockToken(Token<BlockTokenIdentifier> token) {
this.token = token;
return this;
}
public BlockReaderFactory setStartOffset(long startOffset) {
this.startOffset = startOffset;
return this;
}
public BlockReaderFactory setVerifyChecksum(boolean verifyChecksum) {
this.verifyChecksum = verifyChecksum;
return this;
}
public BlockReaderFactory setClientName(String clientName) {
this.clientName = clientName;
return this;
}
public BlockReaderFactory setDatanodeInfo(DatanodeInfo datanode) {
this.datanode = datanode;
return this;
}
public BlockReaderFactory setAllowShortCircuitLocalReads(
boolean allowShortCircuitLocalReads) {
this.allowShortCircuitLocalReads = allowShortCircuitLocalReads;
return this;
}
public BlockReaderFactory setClientCacheContext(
ClientContext clientContext) {
this.clientContext = clientContext;
return this;
}
public BlockReaderFactory setLength(long length) {
this.length = length;
return this;
}
public BlockReaderFactory setCachingStrategy(
CachingStrategy cachingStrategy) {
this.cachingStrategy = cachingStrategy;
return this;
}
public BlockReaderFactory setInetSocketAddress (
InetSocketAddress inetSocketAddress) {
this.inetSocketAddress = inetSocketAddress;
return this;
}
public BlockReaderFactory setUserGroupInformation(
UserGroupInformation userGroupInformation) {
this.userGroupInformation = userGroupInformation;
return this;
}
public BlockReaderFactory setRemotePeerFactory(
RemotePeerFactory remotePeerFactory) {
this.remotePeerFactory = remotePeerFactory;
return this;
}
public BlockReaderFactory setConfiguration(
Configuration configuration) {
this.configuration = configuration;
return this;
}
/**
* Build a BlockReader with the given options.
*
* This function will do the best it can to create a block reader that meets
* all of our requirements. We prefer short-circuit block readers
* (BlockReaderLocal and BlockReaderLocalLegacy) over remote ones, since the
* former avoid the overhead of socket communication. If short-circuit is
* unavailable, our next fallback is data transfer over UNIX domain sockets,
* if dfs.client.domain.socket.data.traffic has been enabled. If that doesn't
* work, we will try to create a remote block reader that operates over TCP
* sockets.
*
* There are a few caches that are important here.
*
* The ShortCircuitCache stores file descriptor objects which have been passed
* from the DataNode.
*
* The DomainSocketFactory stores information about UNIX domain socket paths
that we have not been able to use in the past, so that we don't waste time
* retrying them over and over. (Like all the caches, it does have a timeout,
* though.)
*
* The PeerCache stores peers that we have used in the past. If we can reuse
* one of these peers, we avoid the overhead of re-opening a socket. However,
* if the socket has been timed out on the remote end, our attempt to reuse
* the socket may end with an IOException. For that reason, we limit our
* attempts at socket reuse to dfs.client.cached.conn.retry times. After
* that, we create new sockets. This avoids the problem where a thread tries
* to talk to a peer that it hasn't talked to in a while, and has to clean out
* every entry in a socket cache full of stale entries.
*
* @return The new BlockReader. We will not return null.
*
* @throws InvalidToken
* If the block token was invalid.
* InvalidEncryptionKeyException
* If the encryption key was invalid.
* Other IOException
* If there was another problem.
*/
public BlockReader build() throws IOException {
BlockReader reader = null;
Preconditions.checkNotNull(configuration);
if (conf.shortCircuitLocalReads && allowShortCircuitLocalReads) {
if (clientContext.getUseLegacyBlockReaderLocal()) {
reader = getLegacyBlockReaderLocal();
if (reader != null) {
// Once we've constructed the short-circuit block reader, we don't
// need the socket any more. So let's return it to the cache.
if (peerCache != null) {
peerCache.put(datanodeID, peer);
} else {
IOUtils.cleanup(null, peer);
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": returning new legacy block reader local.");
}
return reader;
}
} else {
reader = getBlockReaderLocal();
if (reader != null) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": returning new block reader local.");
}
return reader;
}
}
// If this is a domain socket and we couldn't (or didn't want to) set
// up a BlockReaderLocal, check that we are allowed to pass data traffic
// over the socket before proceeding.
if (!conf.domainSocketDataTraffic) {
throw new IOException("Because we can't do short-circuit access, " +
"and data traffic over domain sockets is disabled, " +
"we cannot use this socket to talk to " + datanodeID);
}
if (conf.domainSocketDataTraffic) {
reader = getRemoteBlockReaderFromDomain();
if (reader != null) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": returning new remote block reader using " +
"UNIX domain socket on " + pathInfo.getPath());
}
return reader;
}
}
if (conf.useLegacyBlockReader) {
@SuppressWarnings("deprecation")
RemoteBlockReader reader = RemoteBlockReader.newBlockReader(file,
block, blockToken, startOffset, len, conf.ioBufferSize,
verifyChecksum, clientName, peer, datanodeID, peerCache,
cachingStrategy);
return reader;
} else {
return RemoteBlockReader2.newBlockReader(
file, block, blockToken, startOffset, len,
verifyChecksum, clientName, peer, datanodeID, peerCache,
cachingStrategy);
}
Preconditions.checkState(!DFSInputStream.tcpReadsDisabledForTesting,
"TCP reads were disabled for testing, but we failed to " +
"do a non-TCP read.");
return getRemoteBlockReaderFromTcp();
}
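For context, a hedged sketch of how a caller might drive the new builder-style factory; every local variable below (dfsConf, src, blk, blkToken, offsetIntoBlock, len, dnInfo, dnAddr, ctx, strategy, dfsClient, ugi, hadoopConf) is a hypothetical placeholder for a value a real caller already holds, and build() can throw IOException:

BlockReader reader = new BlockReaderFactory(dfsConf).
    setFileName(src).
    setBlock(blk).
    setBlockToken(blkToken).
    setStartOffset(offsetIntoBlock).
    setLength(len).
    setVerifyChecksum(true).
    setClientName("example-client").
    setDatanodeInfo(dnInfo).
    setInetSocketAddress(dnAddr).
    setAllowShortCircuitLocalReads(true).
    setClientCacheContext(ctx).
    setCachingStrategy(strategy).
    setRemotePeerFactory(dfsClient).
    setUserGroupInformation(ugi).
    setConfiguration(hadoopConf).
    build();
// build() tries BlockReaderLocal / BlockReaderLocalLegacy first, then a
// domain-socket remote reader, then a TCP remote reader, as described in
// its javadoc above.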
/**
* Create a new short-circuit BlockReader.
*
* Here, we ask the DataNode to pass us file descriptors over our
* DomainSocket. If the DataNode declines to do so, we'll return null here;
* otherwise, we'll return the BlockReaderLocal. If the DataNode declines,
* this function will inform the DomainSocketFactory that short-circuit local
* reads are disabled for this DataNode, so that we don't ask again.
*
* @param conf the configuration.
* @param file the file name. Used in log messages.
* @param block The block object.
* @param blockToken The block token for security.
* @param startOffset The read offset, relative to block head.
* @param len The number of bytes to read, or -1 to read
* as many as possible.
* @param peer The peer to use.
* @param datanodeID The datanode that the Peer is connected to.
* @param domSockFactory The DomainSocketFactory to notify if the Peer
* is a DomainPeer which turns out to be faulty.
* If null, no factory will be notified in this
* case.
* @param verifyChecksum True if we should verify the checksums.
* Note: even if this is true, when
* DFS_CLIENT_READ_CHECKSUM_SKIP_CHECKSUM_KEY is
* set or the block is mlocked, we will skip
* checksums.
*
* @return The BlockReaderLocal, or null if the
* DataNode declined to provide short-circuit
* access.
* @throws IOException If there was a communication error.
* Get {@link BlockReaderLocalLegacy} for short circuited local reads.
* This block reader implements the path-based style of local reads
* first introduced in HDFS-2246.
*/
private static BlockReaderLocal newShortCircuitBlockReader(
DFSClient.Conf conf, String file, ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken, long startOffset,
long len, Peer peer, DatanodeID datanodeID,
DomainSocketFactory domSockFactory, boolean verifyChecksum,
FileInputStreamCache fisCache,
CachingStrategy cachingStrategy) throws IOException {
private BlockReader getLegacyBlockReaderLocal() throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": trying to construct BlockReaderLocalLegacy");
}
if (!DFSClient.isLocalAddress(inetSocketAddress)) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
"the address " + inetSocketAddress + " is not local");
}
return null;
}
if (clientContext.getDisableLegacyBlockReaderLocal()) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": can't construct BlockReaderLocalLegacy because " +
"disableLegacyBlockReaderLocal is set.");
}
return null;
}
IOException ioe = null;
try {
return BlockReaderLocalLegacy.newBlockReader(conf,
userGroupInformation, configuration, fileName, block, token,
datanode, startOffset, length);
} catch (RemoteException remoteException) {
ioe = remoteException.unwrapRemoteException(
InvalidToken.class, AccessControlException.class);
} catch (IOException e) {
ioe = e;
}
if ((!(ioe instanceof AccessControlException)) &&
isSecurityException(ioe)) {
// Handle security exceptions.
// We do not handle AccessControlException here, since
// BlockReaderLocalLegacy#newBlockReader uses that exception to indicate
// that the user is not in dfs.block.local-path-access.user, a condition
// which requires us to disable legacy SCR.
throw ioe;
}
LOG.warn(this + ": error creating legacy BlockReaderLocal. " +
"Disabling legacy local reads.", ioe);
clientContext.setDisableLegacyBlockReaderLocal();
return null;
}
private BlockReader getBlockReaderLocal() throws InvalidToken {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": trying to construct a BlockReaderLocal " +
"for short-circuit reads.");
}
if (pathInfo == null) {
pathInfo = clientContext.getDomainSocketFactory().
getPathInfo(inetSocketAddress, conf);
}
if (!pathInfo.getPathState().getUsableForShortCircuit()) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": " + pathInfo + " is not " +
"usable for short circuit; giving up on BlockReaderLocal.");
}
return null;
}
ShortCircuitCache cache = clientContext.getShortCircuitCache();
ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
ShortCircuitReplicaInfo info = cache.fetchOrCreate(key, this);
InvalidToken exc = info.getInvalidTokenException();
if (exc != null) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": got InvalidToken exception while trying to " +
"construct BlockReaderLocal via " + pathInfo.getPath());
}
throw exc;
}
if (info.getReplica() == null) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": failed to get ShortCircuitReplica. " +
"Cannot construct BlockReaderLocal via " + pathInfo.getPath());
}
return null;
}
return new BlockReaderLocal.Builder(conf).
setFilename(fileName).
setBlock(block).
setStartOffset(startOffset).
setShortCircuitReplica(info.getReplica()).
setDatanodeID(datanode).
setVerifyChecksum(verifyChecksum).
setCachingStrategy(cachingStrategy).
build();
}
/**
* Fetch a pair of short-circuit block descriptors from a local DataNode.
*
* @return Null if we could not communicate with the datanode,
* a new ShortCircuitReplicaInfo object otherwise.
* ShortCircuitReplicaInfo objects may contain either an InvalidToken
* exception, or a ShortCircuitReplica object ready to use.
*/
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
if (createShortCircuitReplicaInfoCallback != null) {
ShortCircuitReplicaInfo info =
createShortCircuitReplicaInfoCallback.createShortCircuitReplicaInfo();
if (info != null) return info;
}
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": trying to create ShortCircuitReplicaInfo.");
}
BlockReaderPeer curPeer;
while (true) {
curPeer = nextDomainPeer();
if (curPeer == null) break;
DomainPeer peer = (DomainPeer)curPeer.peer;
try {
ShortCircuitReplicaInfo info = requestFileDescriptors(peer);
clientContext.getPeerCache().put(datanode, peer);
return info;
} catch (IOException e) {
if (curPeer.fromCache) {
// Handle an I/O error we got when using a cached socket.
// These are considered less serious, because the socket may be stale.
if (LOG.isDebugEnabled()) {
LOG.debug(this + ": closing stale domain peer " + peer, e);
}
IOUtils.cleanup(LOG, peer);
} else {
// Handle an I/O error we got when using a newly created socket.
// We temporarily disable the domain socket path for a few minutes in
// this case, to prevent wasting more time on it.
LOG.warn(this + ": I/O error requesting file descriptors. " +
"Disabling domain socket " + peer.getDomainSocket(), e);
IOUtils.cleanup(LOG, peer);
clientContext.getDomainSocketFactory()
.disableDomainSocketPath(pathInfo.getPath());
return null;
}
}
}
return null;
}
/**
* Request file descriptors from a DomainPeer.
*
* @return A ShortCircuitReplica object if we could communicate with the
* datanode; null, otherwise.
* @throws IOException If we encountered an I/O exception while communicating
* with the datanode.
*/
private ShortCircuitReplicaInfo requestFileDescriptors(DomainPeer peer)
throws IOException {
final DataOutputStream out =
new DataOutputStream(new BufferedOutputStream(
peer.getOutputStream()));
new Sender(out).requestShortCircuitFds(block, blockToken, 1);
DataInputStream in =
new DataInputStream(peer.getInputStream());
new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
new Sender(out).requestShortCircuitFds(block, token, 1);
DataInputStream in = new DataInputStream(peer.getInputStream());
BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
PBHelper.vintPrefixed(in));
DomainSocket sock = peer.getDomainSocket();
switch (resp.getStatus()) {
case SUCCESS:
BlockReaderLocal reader = null;
byte buf[] = new byte[1];
FileInputStream fis[] = new FileInputStream[2];
sock.recvFileInputStreams(fis, buf, 0, buf.length);
ShortCircuitReplica replica = null;
try {
reader = new BlockReaderLocal.Builder(conf).
setFilename(file).
setBlock(block).
setStartOffset(startOffset).
setStreams(fis).
setDatanodeID(datanodeID).
setVerifyChecksum(verifyChecksum).
setBlockMetadataHeader(
BlockMetadataHeader.preadHeader(fis[1].getChannel())).
setFileInputStreamCache(fisCache).
setCachingStrategy(cachingStrategy).
build();
ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
replica = new ShortCircuitReplica(key, fis[0], fis[1],
clientContext.getShortCircuitCache(), Time.monotonicNow());
} catch (IOException e) {
// This indicates an error reading from disk, or a format error. Since
// it's not a socket communication problem, we return null rather than
// throwing an exception.
LOG.warn(this + ": error creating ShortCircuitReplica.", e);
return null;
} finally {
if (reader == null) {
if (replica == null) {
IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
}
}
return reader;
return new ShortCircuitReplicaInfo(replica);
case ERROR_UNSUPPORTED:
if (!resp.hasShortCircuitAccessVersion()) {
DFSClient.LOG.warn("short-circuit read access is disabled for " +
"DataNode " + datanodeID + ". reason: " + resp.getMessage());
domSockFactory.disableShortCircuitForPath(sock.getPath());
LOG.warn("short-circuit read access is disabled for " +
"DataNode " + datanode + ". reason: " + resp.getMessage());
clientContext.getDomainSocketFactory()
.disableShortCircuitForPath(pathInfo.getPath());
} else {
DFSClient.LOG.warn("short-circuit read access for the file " +
file + " is disabled for DataNode " + datanodeID +
LOG.warn("short-circuit read access for the file " +
fileName + " is disabled for DataNode " + datanode +
". reason: " + resp.getMessage());
}
return null;
case ERROR_ACCESS_TOKEN:
String msg = "access control error while " +
"attempting to set up short-circuit access to " +
file + resp.getMessage();
DFSClient.LOG.debug(msg);
throw new InvalidBlockTokenException(msg);
fileName + ": " + resp.getMessage();
if (LOG.isDebugEnabled()) {
LOG.debug(this + ":" + msg);
}
return new ShortCircuitReplicaInfo(new InvalidToken(msg));
default:
DFSClient.LOG.warn("error while attempting to set up short-circuit " +
"access to " + file + ": " + resp.getMessage());
domSockFactory.disableShortCircuitForPath(sock.getPath());
LOG.warn(this + ": unknown response code " + resp.getStatus() + " while " +
"attempting to set up short-circuit access. " + resp.getMessage());
clientContext.getDomainSocketFactory()
.disableShortCircuitForPath(pathInfo.getPath());
return null;
}
}
/**
* Get a RemoteBlockReader that communicates over a UNIX domain socket.
*
* @return The new BlockReader, or null if we failed to create the block
* reader.
*
* @throws InvalidToken If the block token was invalid.
* Potentially other security-related exceptions.
*/
private BlockReader getRemoteBlockReaderFromDomain() throws IOException {
if (pathInfo == null) {
pathInfo = clientContext.getDomainSocketFactory().
getPathInfo(inetSocketAddress, conf);
}
if (!pathInfo.getPathState().getUsableForDataTransfer()) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": not trying to create a remote block reader " +
"because the UNIX domain socket at " + pathInfo +
" is not usable.");
}
return null;
}
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": trying to create a remote block reader from the " +
"UNIX domain socket at " + pathInfo.getPath());
}
while (true) {
BlockReaderPeer curPeer = nextDomainPeer();
if (curPeer == null) break;
DomainPeer peer = (DomainPeer)curPeer.peer;
BlockReader blockReader = null;
try {
blockReader = getRemoteBlockReader(peer);
return blockReader;
} catch (IOException ioe) {
IOUtils.cleanup(LOG, peer);
if (isSecurityException(ioe)) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": got security exception while constructing " +
"a remote block reader from the unix domain socket at " +
pathInfo.getPath(), ioe);
}
throw ioe;
}
if (curPeer.fromCache) {
// Handle an I/O error we got when using a cached peer. These are
// considered less serious, because the underlying socket may be stale.
if (LOG.isDebugEnabled()) {
LOG.debug("Closed potentially stale domain peer " + peer, ioe);
}
} else {
// Handle an I/O error we got when using a newly created domain peer.
// We temporarily disable the domain socket path for a few minutes in
// this case, to prevent wasting more time on it.
LOG.warn("I/O error constructing remote block reader. Disabling " +
"domain socket " + peer.getDomainSocket(), ioe);
clientContext.getDomainSocketFactory()
.disableDomainSocketPath(pathInfo.getPath());
return null;
}
} finally {
if (blockReader == null) {
IOUtils.cleanup(LOG, peer);
}
}
}
return null;
}
/**
* Get a RemoteBlockReader that communicates over a TCP socket.
*
* @return The new BlockReader. We will not return null, but instead throw
* an exception if this fails.
*
* @throws InvalidToken
* If the block token was invalid.
* InvalidEncryptionKeyException
* If the encryption key was invalid.
* Other IOException
* If there was another problem.
*/
private BlockReader getRemoteBlockReaderFromTcp() throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": trying to create a remote block reader from a " +
"TCP socket");
}
BlockReader blockReader = null;
while (true) {
BlockReaderPeer curPeer = null;
Peer peer = null;
try {
curPeer = nextTcpPeer();
if (curPeer == null) break;
peer = curPeer.peer;
blockReader = getRemoteBlockReader(peer);
return blockReader;
} catch (IOException ioe) {
if (isSecurityException(ioe)) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": got security exception while constructing " +
"a remote block reader from " + peer, ioe);
}
throw ioe;
}
if ((curPeer != null) && curPeer.fromCache) {
// Handle an I/O error we got when using a cached peer. These are
// considered less serious, because the underlying socket may be
// stale.
if (LOG.isDebugEnabled()) {
LOG.debug("Closed potentially stale remote peer " + peer, ioe);
}
} else {
// Handle an I/O error we got when using a newly created peer.
LOG.warn("I/O error constructing remote block reader.", ioe);
throw ioe;
}
} finally {
if (blockReader == null) {
IOUtils.cleanup(LOG, peer);
}
}
}
return null;
}
private static class BlockReaderPeer {
final Peer peer;
final boolean fromCache;
BlockReaderPeer(Peer peer, boolean fromCache) {
this.peer = peer;
this.fromCache = fromCache;
}
}
/**
* Get the next DomainPeer-- either from the cache or by creating it.
*
* @return the next DomainPeer, or null if we could not construct one.
*/
private BlockReaderPeer nextDomainPeer() {
if (remainingCacheTries > 0) {
Peer peer = clientContext.getPeerCache().get(datanode, true);
if (peer != null) {
remainingCacheTries--;
if (LOG.isTraceEnabled()) {
LOG.trace("nextDomainPeer: reusing existing peer " + peer);
}
return new BlockReaderPeer(peer, true);
}
}
DomainSocket sock = clientContext.getDomainSocketFactory().
createSocket(pathInfo, conf.socketTimeout);
if (sock == null) return null;
return new BlockReaderPeer(new DomainPeer(sock), false);
}
/**
* Get the next TCP-based peer-- either from the cache or by creating it.
*
* @return the next Peer, or null if we could not construct one.
*
* @throws IOException If there was an error while constructing the peer
* (such as an InvalidEncryptionKeyException)
*/
private BlockReaderPeer nextTcpPeer() throws IOException {
if (remainingCacheTries > 0) {
Peer peer = clientContext.getPeerCache().get(datanode, false);
if (peer != null) {
remainingCacheTries--;
if (LOG.isTraceEnabled()) {
LOG.trace("nextTcpPeer: reusing existing peer " + peer);
}
return new BlockReaderPeer(peer, true);
}
}
try {
Peer peer = remotePeerFactory.newConnectedPeer(inetSocketAddress);
if (LOG.isTraceEnabled()) {
LOG.trace("nextTcpPeer: created newConnectedPeer " + peer);
}
return new BlockReaderPeer(peer, false);
} catch (IOException e) {
if (LOG.isTraceEnabled()) {
LOG.trace("nextTcpPeer: failed to create newConnectedPeer " +
"connected to " + datanode);
}
throw e;
}
}
/**
* Determine if an exception is security-related.
*
* We need to handle these exceptions differently than other IOExceptions.
* They don't indicate a communication problem. Instead, they mean that there
* is some action the client needs to take, such as refetching block tokens,
* renewing encryption keys, etc.
*
* @param ioe The exception
* @return True only if the exception is security-related.
*/
private static boolean isSecurityException(IOException ioe) {
return (ioe instanceof InvalidToken) ||
(ioe instanceof InvalidEncryptionKeyException) ||
(ioe instanceof InvalidBlockTokenException) ||
(ioe instanceof AccessControlException);
}
@SuppressWarnings("deprecation")
private BlockReader getRemoteBlockReader(Peer peer) throws IOException {
if (conf.useLegacyBlockReader) {
return RemoteBlockReader.newBlockReader(fileName,
block, token, startOffset, length, conf.ioBufferSize,
verifyChecksum, clientName, peer, datanode,
clientContext.getPeerCache(), cachingStrategy);
} else {
return RemoteBlockReader2.newBlockReader(
fileName, block, token, startOffset, length,
verifyChecksum, clientName, peer, datanode,
clientContext.getPeerCache(), cachingStrategy);
}
}
@Override
public String toString() {
return "BlockReaderFactory(fileName=" + fileName + ", block=" + block + ")";
}
/**
* File name to print when accessing a block directly (from servlets)
* @param s Address of the block location
@ -246,23 +777,4 @@ public static String getFileName(final InetSocketAddress s,
final String poolId, final long blockId) {
return s.toString() + ":" + poolId + ":" + blockId;
}
/**
* Get {@link BlockReaderLocalLegacy} for short circuited local reads.
* This block reader implements the path-based style of local reads
* first introduced in HDFS-2246.
*/
static BlockReader getLegacyBlockReaderLocal(DFSClient dfsClient,
String src, ExtendedBlock blk,
Token<BlockTokenIdentifier> accessToken, DatanodeInfo chosenNode,
long offsetIntoBlock) throws InvalidToken, IOException {
try {
final long length = blk.getNumBytes() - offsetIntoBlock;
return BlockReaderLocalLegacy.newBlockReader(dfsClient, src, blk,
accessToken, chosenNode, offsetIntoBlock, length);
} catch (RemoteException re) {
throw re.unwrapRemoteException(InvalidToken.class,
AccessControlException.class);
}
}
}

View File

@ -28,8 +28,9 @@
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.DFSClient.Conf;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
@ -67,12 +68,10 @@ public static class Builder {
private boolean verifyChecksum;
private int maxReadahead;
private String filename;
private FileInputStream streams[];
private ShortCircuitReplica replica;
private long dataPos;
private DatanodeID datanodeID;
private FileInputStreamCache fisCache;
private boolean mlocked;
private BlockMetadataHeader header;
private ExtendedBlock block;
public Builder(Conf conf) {
@ -99,8 +98,8 @@ public Builder setFilename(String filename) {
return this;
}
public Builder setStreams(FileInputStream streams[]) {
this.streams = streams;
public Builder setShortCircuitReplica(ShortCircuitReplica replica) {
this.replica = replica;
return this;
}
@ -114,30 +113,18 @@ public Builder setDatanodeID(DatanodeID datanodeID) {
return this;
}
public Builder setFileInputStreamCache(FileInputStreamCache fisCache) {
this.fisCache = fisCache;
return this;
}
public Builder setMlocked(boolean mlocked) {
this.mlocked = mlocked;
return this;
}
public Builder setBlockMetadataHeader(BlockMetadataHeader header) {
this.header = header;
return this;
}
public Builder setBlock(ExtendedBlock block) {
this.block = block;
return this;
}
public BlockReaderLocal build() {
Preconditions.checkNotNull(streams);
Preconditions.checkArgument(streams.length == 2);
Preconditions.checkNotNull(header);
Preconditions.checkNotNull(replica);
return new BlockReaderLocal(this);
}
}
@ -147,7 +134,7 @@ public BlockReaderLocal build() {
/**
* Pair of streams for this block.
*/
private final FileInputStream streams[];
private final ShortCircuitReplica replica;
/**
* The data FileChannel.
@ -207,12 +194,6 @@ public BlockReaderLocal build() {
*/
private int checksumSize;
/**
* FileInputStream cache to return the streams to upon closing,
* or null if we should just close them unconditionally.
*/
private final FileInputStreamCache fisCache;
/**
* Maximum number of chunks to allocate.
*
@ -257,20 +238,18 @@ public BlockReaderLocal build() {
*/
private ByteBuffer checksumBuf;
private boolean mmapDisabled = false;
private BlockReaderLocal(Builder builder) {
this.streams = builder.streams;
this.dataIn = builder.streams[0].getChannel();
this.replica = builder.replica;
this.dataIn = replica.getDataStream().getChannel();
this.dataPos = builder.dataPos;
this.checksumIn = builder.streams[1].getChannel();
this.checksum = builder.header.getChecksum();
this.checksumIn = replica.getMetaStream().getChannel();
BlockMetadataHeader header = builder.replica.getMetaHeader();
this.checksum = header.getChecksum();
this.verifyChecksum = builder.verifyChecksum &&
(this.checksum.getChecksumType().id != DataChecksum.CHECKSUM_NULL);
this.mlocked = new AtomicBoolean(builder.mlocked);
this.filename = builder.filename;
this.datanodeID = builder.datanodeID;
this.fisCache = builder.fisCache;
this.block = builder.block;
this.bytesPerChecksum = checksum.getBytesPerChecksum();
this.checksumSize = checksum.getChecksumSize();
@ -642,20 +621,7 @@ public synchronized void close() throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace("close(filename=" + filename + ", block=" + block + ")");
}
if (clientMmap != null) {
clientMmap.unref();
clientMmap = null;
}
if (fisCache != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("putting FileInputStream for " + filename +
" back into FileInputStreamCache");
}
fisCache.put(datanodeID, block, streams);
} else {
LOG.debug("closing FileInputStream for " + filename);
IOUtils.cleanup(LOG, dataIn, checksumIn);
}
replica.unref();
freeDataBufIfExists();
freeChecksumBufIfExists();
}
@ -683,8 +649,7 @@ public boolean isShortCircuit() {
}
@Override
public synchronized ClientMmap getClientMmap(EnumSet<ReadOption> opts,
ClientMmapManager mmapManager) {
public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
if ((!opts.contains(ReadOption.SKIP_CHECKSUMS)) &&
verifyChecksum && (!mlocked.get())) {
if (LOG.isTraceEnabled()) {
@ -694,27 +659,7 @@ public synchronized ClientMmap getClientMmap(EnumSet<ReadOption> opts,
}
return null;
}
if (clientMmap == null) {
if (mmapDisabled) {
return null;
}
try {
clientMmap = mmapManager.fetch(datanodeID, block, streams[0]);
if (clientMmap == null) {
mmapDisabled = true;
return null;
}
} catch (InterruptedException e) {
LOG.error("Interrupted while setting up mmap for " + filename, e);
Thread.currentThread().interrupt();
return null;
} catch (IOException e) {
LOG.error("unable to set up mmap for " + filename, e);
mmapDisabled = true;
return null;
}
}
return clientMmap;
return replica.getOrCreateClientMmap();
}
/**

View File

@ -31,7 +31,6 @@
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
@ -175,19 +174,21 @@ private void removeBlockLocalPathInfo(ExtendedBlock b) {
/**
* The only way this object can be instantiated.
*/
static BlockReaderLocalLegacy newBlockReader(DFSClient dfsClient,
String file, ExtendedBlock blk, Token<BlockTokenIdentifier> token,
DatanodeInfo node, long startOffset, long length)
throws IOException {
final DFSClient.Conf conf = dfsClient.getConf();
static BlockReaderLocalLegacy newBlockReader(DFSClient.Conf conf,
UserGroupInformation userGroupInformation,
Configuration configuration, String file, ExtendedBlock blk,
Token<BlockTokenIdentifier> token, DatanodeInfo node,
long startOffset, long length) throws IOException {
LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node
.getIpcPort());
// check the cache first
BlockLocalPathInfo pathinfo = localDatanodeInfo.getBlockLocalPathInfo(blk);
if (pathinfo == null) {
pathinfo = getBlockPathInfo(dfsClient.ugi, blk, node,
dfsClient.getConfiguration(), dfsClient.getHdfsTimeout(), token,
if (userGroupInformation == null) {
userGroupInformation = UserGroupInformation.getCurrentUser();
}
pathinfo = getBlockPathInfo(userGroupInformation, blk, node,
configuration, conf.hdfsTimeout, token,
conf.connectToDnViaHostname);
}
@ -708,8 +709,7 @@ public boolean isShortCircuit() {
}
@Override
public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
ClientMmapManager mmapManager) {
public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
return null;
}
}

View File

@ -0,0 +1,204 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import com.google.common.annotations.VisibleForTesting;
import java.util.HashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSClient.Conf;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
/**
* ClientContext contains context information for a client.
*
* This allows us to share caches such as the socket cache across
* DFSClient instances.
*/
@InterfaceAudience.Private
public class ClientContext {
private static final Log LOG = LogFactory.getLog(ClientContext.class);
/**
* Global map of context names to cached contexts.
*/
private final static HashMap<String, ClientContext> CACHES =
new HashMap<String, ClientContext>();
/**
* Name of context.
*/
private final String name;
/**
* String representation of the configuration.
*/
private final String confString;
/**
* Caches short-circuit file descriptors, mmap regions.
*/
private final ShortCircuitCache shortCircuitCache;
/**
* Caches TCP and UNIX domain sockets for reuse.
*/
private final PeerCache peerCache;
/**
* Stores information about socket paths.
*/
private final DomainSocketFactory domainSocketFactory;
/**
* True if we should use the legacy BlockReaderLocal.
*/
private final boolean useLegacyBlockReaderLocal;
/**
* True if the legacy BlockReaderLocal is disabled.
*
* The legacy block reader local gets disabled completely whenever there is an
* error or miscommunication. The new block reader local code handles this
* case more gracefully inside DomainSocketFactory.
*/
private volatile boolean disableLegacyBlockReaderLocal = false;
/**
* Whether we have already warned about a DFSClient fetching a ClientContext
* that did not match its configuration values.
*/
private boolean printedConfWarning = false;
private ClientContext(String name, Conf conf) {
this.name = name;
this.confString = confAsString(conf);
this.shortCircuitCache = new ShortCircuitCache(
conf.shortCircuitStreamsCacheSize,
conf.shortCircuitStreamsCacheExpiryMs,
conf.shortCircuitMmapCacheSize,
conf.shortCircuitMmapCacheExpiryMs,
conf.shortCircuitMmapCacheRetryTimeout,
conf.shortCircuitCacheStaleThresholdMs);
this.peerCache =
new PeerCache(conf.socketCacheCapacity, conf.socketCacheExpiry);
this.useLegacyBlockReaderLocal = conf.useLegacyBlockReaderLocal;
this.domainSocketFactory = new DomainSocketFactory(conf);
}
public static String confAsString(Conf conf) {
StringBuilder builder = new StringBuilder();
builder.append("shortCircuitStreamsCacheSize = ").
append(conf.shortCircuitStreamsCacheSize).
append(", shortCircuitStreamsCacheExpiryMs = ").
append(conf.shortCircuitStreamsCacheExpiryMs).
append(", shortCircuitMmapCacheSize = ").
append(conf.shortCircuitMmapCacheSize).
append(", shortCircuitMmapCacheExpiryMs = ").
append(conf.shortCircuitMmapCacheExpiryMs).
append(", shortCircuitMmapCacheRetryTimeout = ").
append(conf.shortCircuitMmapCacheRetryTimeout).
append(", shortCircuitCacheStaleThresholdMs = ").
append(conf.shortCircuitCacheStaleThresholdMs).
append(", socketCacheCapacity = ").
append(conf.socketCacheCapacity).
append(", socketCacheExpiry = ").
append(conf.socketCacheExpiry).
append(", shortCircuitLocalReads = ").
append(conf.shortCircuitLocalReads).
append(", useLegacyBlockReaderLocal = ").
append(conf.useLegacyBlockReaderLocal).
append(", domainSocketDataTraffic = ").
append(conf.domainSocketDataTraffic);
return builder.toString();
}
public static ClientContext get(String name, Conf conf) {
ClientContext context;
synchronized(ClientContext.class) {
context = CACHES.get(name);
if (context == null) {
context = new ClientContext(name, conf);
CACHES.put(name, context);
} else {
context.printConfWarningIfNeeded(conf);
}
}
return context;
}
/**
* Get a client context, from a Configuration object.
*
* This method is less efficient than the version which takes a DFSClient#Conf
* object, and is intended mainly for use by tests.
*/
@VisibleForTesting
public static ClientContext getFromConf(Configuration conf) {
return get(conf.get(DFSConfigKeys.DFS_CLIENT_CONTEXT,
DFSConfigKeys.DFS_CLIENT_CONTEXT_DEFAULT),
new DFSClient.Conf(conf));
}
private void printConfWarningIfNeeded(Conf conf) {
String existing = this.getConfString();
String requested = confAsString(conf);
if (!existing.equals(requested)) {
if (!printedConfWarning) {
printedConfWarning = true;
LOG.warn("Existing client context '" + name + "' does not match " +
"requested configuration. Existing: " + existing +
", Requested: " + requested);
}
}
}
public String getConfString() {
return confString;
}
public ShortCircuitCache getShortCircuitCache() {
return shortCircuitCache;
}
public PeerCache getPeerCache() {
return peerCache;
}
public boolean getUseLegacyBlockReaderLocal() {
return useLegacyBlockReaderLocal;
}
public boolean getDisableLegacyBlockReaderLocal() {
return disableLegacyBlockReaderLocal;
}
public void setDisableLegacyBlockReaderLocal() {
disableLegacyBlockReaderLocal = true;
}
public DomainSocketFactory getDomainSocketFactory() {
return domainSocketFactory;
}
}
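A minimal sketch (hypothetical, not part of this change) of the sharing behavior: two lookups under the same context name return the same ClientContext instance, so DFSClient instances created with that name share one set of caches.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;

public class ClientContextSharing {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    DFSClient.Conf dfsConf = new DFSClient.Conf(conf);

    // Same name -> same cached ClientContext instance.
    ClientContext a = ClientContext.get("example-context", dfsConf);
    ClientContext b = ClientContext.get("example-context", dfsConf);

    System.out.println(a == b);                                              // true
    System.out.println(a.getShortCircuitCache() == b.getShortCircuitCache()); // true
  }
}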

View File

@ -56,6 +56,8 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_WRITE_TIMEOUT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_REPLICATION_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT_DEFAULT;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
@ -110,10 +112,11 @@
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
import org.apache.hadoop.hdfs.client.HdfsDataOutputStream;
import org.apache.hadoop.hdfs.protocol.AclException;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveIterator;
@ -192,7 +195,7 @@
*
********************************************************/
@InterfaceAudience.Private
public class DFSClient implements java.io.Closeable {
public class DFSClient implements java.io.Closeable, RemotePeerFactory {
public static final Log LOG = LogFactory.getLog(DFSClient.class);
public static final long SERVER_DEFAULTS_VALIDITY_PERIOD = 60 * 60 * 1000L; // 1 hour
static final int TCP_WINDOW_SIZE = 128 * 1024; // 128 KB
@ -213,50 +216,13 @@ public class DFSClient implements java.io.Closeable {
final ReplaceDatanodeOnFailure dtpReplaceDatanodeOnFailure;
final FileSystem.Statistics stats;
private final String authority;
final PeerCache peerCache;
private Random r = new Random();
private SocketAddress[] localInterfaceAddrs;
private DataEncryptionKey encryptionKey;
private boolean shouldUseLegacyBlockReaderLocal;
private final CachingStrategy defaultReadCachingStrategy;
private final CachingStrategy defaultWriteCachingStrategy;
private ClientMmapManager mmapManager;
private final ClientContext clientContext;
private static final ClientMmapManagerFactory MMAP_MANAGER_FACTORY =
new ClientMmapManagerFactory();
private static final class ClientMmapManagerFactory {
private ClientMmapManager mmapManager = null;
/**
* Tracks the number of users of mmapManager.
*/
private int refcnt = 0;
synchronized ClientMmapManager get(Configuration conf) {
if (refcnt++ == 0) {
mmapManager = ClientMmapManager.fromConf(conf);
} else {
String mismatches = mmapManager.verifyConfigurationMatches(conf);
if (!mismatches.isEmpty()) {
LOG.warn("The ClientMmapManager settings you specified " +
"have been ignored because another thread created the " +
"ClientMmapManager first. " + mismatches);
}
}
return mmapManager;
}
synchronized void unref(ClientMmapManager mmapManager) {
if (this.mmapManager != mmapManager) {
throw new IllegalArgumentException();
}
if (--refcnt == 0) {
IOUtils.cleanup(LOG, mmapManager);
mmapManager = null;
}
}
}
/**
* DFSClient configuration
*/
@ -301,6 +267,11 @@ public static class Conf {
final boolean domainSocketDataTraffic;
final int shortCircuitStreamsCacheSize;
final long shortCircuitStreamsCacheExpiryMs;
final int shortCircuitMmapCacheSize;
final long shortCircuitMmapCacheExpiryMs;
final long shortCircuitMmapCacheRetryTimeout;
final long shortCircuitCacheStaleThresholdMs;
public Conf(Configuration conf) {
// The hdfsTimeout is currently the same as the ipc timeout
@ -417,6 +388,18 @@ public Conf(Configuration conf) {
shortCircuitStreamsCacheExpiryMs = conf.getLong(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT);
shortCircuitMmapCacheSize = conf.getInt(
DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT);
shortCircuitMmapCacheExpiryMs = conf.getLong(
DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT);
shortCircuitMmapCacheRetryTimeout = conf.getLong(
DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT);
shortCircuitCacheStaleThresholdMs = conf.getLong(
DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT);
}
private DataChecksum.Type getChecksumType(Configuration conf) {
@ -465,11 +448,11 @@ private DataChecksum createChecksum(ChecksumOpt userOpt)
public Conf getConf() {
return dfsClientConf;
}
Configuration getConfiguration() {
return conf;
}
/**
* A map from file names to {@link DFSOutputStream} objects
* that are currently being written by this client.
@ -478,8 +461,6 @@ Configuration getConfiguration() {
private final Map<String, DFSOutputStream> filesBeingWritten
= new HashMap<String, DFSOutputStream>();
private final DomainSocketFactory domainSocketFactory;
/**
* Same as this(NameNode.getAddress(conf), conf);
* @see #DFSClient(InetSocketAddress, Configuration)
@ -527,8 +508,6 @@ public DFSClient(URI nameNodeUri, ClientProtocol rpcNamenode,
throws IOException {
// Copy only the required DFSClient configuration
this.dfsClientConf = new Conf(conf);
this.shouldUseLegacyBlockReaderLocal =
this.dfsClientConf.useLegacyBlockReaderLocal;
if (this.dfsClientConf.useLegacyBlockReaderLocal) {
LOG.debug("Using legacy short-circuit local reads.");
}
@@ -573,9 +552,6 @@ public DFSClient(URI nameNodeUri, ClientProtocol rpcNamenode,
this.namenode = proxyInfo.getProxy();
}
// read directly from the block file if configured.
this.domainSocketFactory = new DomainSocketFactory(dfsClientConf);
String localInterfaces[] =
conf.getTrimmedStrings(DFSConfigKeys.DFS_CLIENT_LOCAL_INTERFACES);
localInterfaceAddrs = getLocalInterfaceAddrs(localInterfaces);
@@ -585,7 +561,6 @@ public DFSClient(URI nameNodeUri, ClientProtocol rpcNamenode,
Joiner.on(',').join(localInterfaceAddrs) + "]");
}
this.peerCache = PeerCache.getInstance(dfsClientConf.socketCacheCapacity, dfsClientConf.socketCacheExpiry);
Boolean readDropBehind = (conf.get(DFS_CLIENT_CACHE_DROP_BEHIND_READS) == null) ?
null : conf.getBoolean(DFS_CLIENT_CACHE_DROP_BEHIND_READS, false);
Long readahead = (conf.get(DFS_CLIENT_CACHE_READAHEAD) == null) ?
@@ -596,7 +571,9 @@ public DFSClient(URI nameNodeUri, ClientProtocol rpcNamenode,
new CachingStrategy(readDropBehind, readahead);
this.defaultWriteCachingStrategy =
new CachingStrategy(writeDropBehind, readahead);
this.mmapManager = MMAP_MANAGER_FACTORY.get(conf);
this.clientContext = ClientContext.get(
conf.get(DFS_CLIENT_CONTEXT, DFS_CLIENT_CONTEXT_DEFAULT),
dfsClientConf);
}
/**
@@ -801,10 +778,6 @@ void closeConnectionToNamenode() {
/** Abort and release resources held. Ignore all errors. */
void abort() {
if (mmapManager != null) {
MMAP_MANAGER_FACTORY.unref(mmapManager);
mmapManager = null;
}
clientRunning = false;
closeAllFilesBeingWritten(true);
try {
@@ -850,10 +823,6 @@ private void closeAllFilesBeingWritten(final boolean abort) {
*/
@Override
public synchronized void close() throws IOException {
if (mmapManager != null) {
MMAP_MANAGER_FACTORY.unref(mmapManager);
mmapManager = null;
}
if(clientRunning) {
closeAllFilesBeingWritten(false);
clientRunning = false;
@@ -2623,18 +2592,6 @@ public String toString() {
+ ", ugi=" + ugi + "]";
}
public DomainSocketFactory getDomainSocketFactory() {
return domainSocketFactory;
}
public void disableLegacyBlockReaderLocal() {
shouldUseLegacyBlockReaderLocal = false;
}
public boolean useLegacyBlockReaderLocal() {
return shouldUseLegacyBlockReaderLocal;
}
public CachingStrategy getDefaultReadCachingStrategy() {
return defaultReadCachingStrategy;
}
@@ -2643,9 +2600,8 @@ public CachingStrategy getDefaultWriteCachingStrategy() {
return defaultWriteCachingStrategy;
}
@VisibleForTesting
public ClientMmapManager getMmapManager() {
return mmapManager;
public ClientContext getClientContext() {
return clientContext;
}
void modifyAclEntries(String src, List<AclEntry> aclSpec)
@@ -2736,4 +2692,26 @@ AclStatus getAclStatus(String src) throws IOException {
UnresolvedPathException.class);
}
}
@Override // RemotePeerFactory
public Peer newConnectedPeer(InetSocketAddress addr) throws IOException {
Peer peer = null;
boolean success = false;
Socket sock = null;
try {
sock = socketFactory.createSocket();
NetUtils.connect(sock, addr,
getRandomLocalInterfaceAddr(),
dfsClientConf.socketTimeout);
peer = TcpPeerServer.peerFromSocketAndKey(sock,
getDataEncryptionKey());
success = true;
return peer;
} finally {
if (!success) {
IOUtils.cleanup(LOG, peer);
IOUtils.closeSocket(sock);
}
}
}
}
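The constructor change above swaps the per-client mmap manager for a ClientContext looked up by name, so DFSClient instances configured with the same dfs.client.context value share their client-side caches. A minimal sketch of opting into a named context (the context name and the use of FileSystem.newInstance here are illustrative assumptions, not part of this patch):

    Configuration conf = new Configuration();
    // Clients created with the same context name resolve to the same ClientContext,
    // and therefore share the caches keyed by that name.
    conf.set("dfs.client.context", "my-app");                        // hypothetical name
    FileSystem fs1 = FileSystem.newInstance(FileSystem.getDefaultUri(conf), conf);
    FileSystem fs2 = FileSystem.newInstance(FileSystem.getDefaultUri(conf), conf);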

View File

@@ -59,6 +59,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final String DFS_CLIENT_CACHE_DROP_BEHIND_WRITES = "dfs.client.cache.drop.behind.writes";
public static final String DFS_CLIENT_CACHE_DROP_BEHIND_READS = "dfs.client.cache.drop.behind.reads";
public static final String DFS_CLIENT_CACHE_READAHEAD = "dfs.client.cache.readahead";
public static final String DFS_CLIENT_CONTEXT = "dfs.client.context";
public static final String DFS_CLIENT_CONTEXT_DEFAULT = "default";
public static final String DFS_HDFS_BLOCKS_METADATA_ENABLED = "dfs.datanode.hdfs-blocks-metadata.enabled";
public static final boolean DFS_HDFS_BLOCKS_METADATA_ENABLED_DEFAULT = false;
public static final String DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_NUM_THREADS = "dfs.client.file-block-storage-locations.num-threads";
@@ -420,18 +422,20 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final boolean DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT = false;
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY = "dfs.client.read.shortcircuit.buffer.size";
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY = "dfs.client.read.shortcircuit.streams.cache.size";
public static final int DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT = 100;
public static final int DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT = 256;
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY = "dfs.client.read.shortcircuit.streams.cache.expiry.ms";
public static final long DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT = 5000;
public static final long DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT = 5 * 60 * 1000;
public static final int DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT = 1024 * 1024;
public static final String DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC = "dfs.client.domain.socket.data.traffic";
public static final boolean DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT = false;
public static final String DFS_CLIENT_MMAP_CACHE_SIZE = "dfs.client.mmap.cache.size";
public static final int DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT = 1024;
public static final int DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT = 256;
public static final String DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS = "dfs.client.mmap.cache.timeout.ms";
public static final long DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT = 15 * 60 * 1000;
public static final String DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT = "dfs.client.mmap.cache.thread.runs.per.timeout";
public static final int DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT = 4;
public static final long DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT = 60 * 60 * 1000;
public static final String DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS = "dfs.client.mmap.retry.timeout.ms";
public static final long DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT = 5 * 60 * 1000;
public static final String DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS = "dfs.client.short.circuit.replica.stale.threshold.ms";
public static final long DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT = 30 * 60 * 1000;
// property for fsimage compression
public static final String DFS_IMAGE_COMPRESS_KEY = "dfs.image.compress";
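For reference, the renamed and newly added client keys in this hunk can also be set programmatically; the values below simply restate the defaults shown above and are not a tuning recommendation:

    Configuration conf = new Configuration();
    conf.setInt("dfs.client.read.shortcircuit.streams.cache.size", 256);
    conf.setLong("dfs.client.read.shortcircuit.streams.cache.expiry.ms", 5 * 60 * 1000L);
    conf.setInt("dfs.client.mmap.cache.size", 256);
    conf.setLong("dfs.client.mmap.cache.timeout.ms", 60 * 60 * 1000L);
    conf.setLong("dfs.client.mmap.retry.timeout.ms", 5 * 60 * 1000L);
    conf.setLong("dfs.client.short.circuit.replica.stale.threshold.ms", 30 * 60 * 1000L);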

View File

@@ -46,9 +46,6 @@
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -82,7 +79,6 @@ public class DFSInputStream extends FSInputStream
HasEnhancedByteBufferAccess {
@VisibleForTesting
static boolean tcpReadsDisabledForTesting = false;
private final PeerCache peerCache;
private final DFSClient dfsClient;
private boolean closed = false;
private final String src;
@@ -190,8 +186,6 @@ void addZeroCopyBytes(long amt) {
private long totalZeroCopyBytesRead;
}
private final FileInputStreamCache fileInputStreamCache;
/**
* This variable tracks the number of failures since the start of the
* most recent user-facing operation. That is to say, it should be reset
@@ -223,10 +217,6 @@ void addToDeadNodes(DatanodeInfo dnInfo) {
this.verifyChecksum = verifyChecksum;
this.buffersize = buffersize;
this.src = src;
this.peerCache = dfsClient.peerCache;
this.fileInputStreamCache = new FileInputStreamCache(
dfsClient.getConf().shortCircuitStreamsCacheSize,
dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
this.cachingStrategy =
dfsClient.getDefaultReadCachingStrategy();
openInfo();
@@ -572,18 +562,28 @@ private synchronized DatanodeInfo blockSeekTo(long target) throws IOException {
try {
ExtendedBlock blk = targetBlock.getBlock();
Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
blockReader = getBlockReader(targetAddr, chosenNode, src, blk,
accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock,
buffersize, verifyChecksum, dfsClient.clientName, cachingStrategy);
blockReader = new BlockReaderFactory(dfsClient.getConf()).
setInetSocketAddress(targetAddr).
setRemotePeerFactory(dfsClient).
setDatanodeInfo(chosenNode).
setFileName(src).
setBlock(blk).
setBlockToken(accessToken).
setStartOffset(offsetIntoBlock).
setVerifyChecksum(verifyChecksum).
setClientName(dfsClient.clientName).
setLength(blk.getNumBytes() - offsetIntoBlock).
setCachingStrategy(cachingStrategy).
setAllowShortCircuitLocalReads(!shortCircuitForbidden()).
setClientCacheContext(dfsClient.getClientContext()).
setUserGroupInformation(dfsClient.ugi).
setConfiguration(dfsClient.getConfiguration()).
build();
if(connectFailedOnce) {
DFSClient.LOG.info("Successfully connected to " + targetAddr +
" for " + blk);
}
return chosenNode;
} catch (AccessControlException ex) {
DFSClient.LOG.warn("Short circuit access failed " + ex);
dfsClient.disableLegacyBlockReaderLocal();
continue;
} catch (IOException ex) {
if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
DFSClient.LOG.info("Will fetch a new encryption key and retry, "
@@ -635,7 +635,6 @@ public void accept(ByteBuffer k, Object v) {
blockReader = null;
}
super.close();
fileInputStreamCache.close();
closed = true;
}
@@ -932,9 +931,11 @@ private void fetchBlockByteRange(LocatedBlock block, long start, long end,
// or fetchBlockAt(). Always get the latest list of locations at the
// start of the loop.
CachingStrategy curCachingStrategy;
boolean allowShortCircuitLocalReads;
synchronized (this) {
block = getBlockAt(block.getStartOffset(), false);
curCachingStrategy = cachingStrategy;
allowShortCircuitLocalReads = !shortCircuitForbidden();
}
DNAddrPair retval = chooseDataNode(block);
DatanodeInfo chosenNode = retval.info;
@@ -943,11 +944,24 @@ private void fetchBlockByteRange(LocatedBlock block, long start, long end,
try {
Token<BlockTokenIdentifier> blockToken = block.getBlockToken();
int len = (int) (end - start + 1);
reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(),
blockToken, start, len, buffersize, verifyChecksum,
dfsClient.clientName, curCachingStrategy);
reader = new BlockReaderFactory(dfsClient.getConf()).
setInetSocketAddress(targetAddr).
setRemotePeerFactory(dfsClient).
setDatanodeInfo(chosenNode).
setFileName(src).
setBlock(block.getBlock()).
setBlockToken(blockToken).
setStartOffset(start).
setVerifyChecksum(verifyChecksum).
setClientName(dfsClient.clientName).
setLength(len).
setCachingStrategy(curCachingStrategy).
setAllowShortCircuitLocalReads(allowShortCircuitLocalReads).
setClientCacheContext(dfsClient.getClientContext()).
setUserGroupInformation(dfsClient.ugi).
setConfiguration(dfsClient.getConfiguration()).
build();
int nread = reader.readAll(buf, offset, len);
if (nread != len) {
throw new IOException("truncated return from reader.read(): " +
@@ -960,10 +974,6 @@ private void fetchBlockByteRange(LocatedBlock block, long start, long end,
e.getPos() + " from " + chosenNode);
// we want to remember what we have tried
addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
} catch (AccessControlException ex) {
DFSClient.LOG.warn("Short circuit access failed " + ex);
dfsClient.disableLegacyBlockReaderLocal();
continue;
} catch (IOException e) {
if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
DFSClient.LOG.info("Will fetch a new encryption key and retry, "
@@ -1022,194 +1032,6 @@ private static boolean tokenRefetchNeeded(IOException ex,
return false;
}
private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
Peer peer = null;
boolean success = false;
Socket sock = null;
try {
sock = dfsClient.socketFactory.createSocket();
NetUtils.connect(sock, addr,
dfsClient.getRandomLocalInterfaceAddr(),
dfsClient.getConf().socketTimeout);
peer = TcpPeerServer.peerFromSocketAndKey(sock,
dfsClient.getDataEncryptionKey());
success = true;
return peer;
} finally {
if (!success) {
IOUtils.closeQuietly(peer);
IOUtils.closeQuietly(sock);
}
}
}
/**
* Retrieve a BlockReader suitable for reading.
* This method will reuse the cached connection to the DN if appropriate.
* Otherwise, it will create a new connection.
* Throwing an IOException from this method is basically equivalent to
* declaring the DataNode bad, so we try to connect a lot of different ways
* before doing that.
*
* @param dnAddr Address of the datanode
* @param chosenNode Chosen datanode information
* @param file File location
* @param block The Block object
* @param blockToken The access token for security
* @param startOffset The read offset, relative to block head
* @param len The number of bytes to read
* @param bufferSize The IO buffer size (not the client buffer size)
* @param verifyChecksum Whether to verify checksum
* @param clientName Client name
* @param curCachingStrategy caching strategy to use
* @return New BlockReader instance
*/
protected BlockReader getBlockReader(InetSocketAddress dnAddr,
DatanodeInfo chosenNode,
String file,
ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken,
long startOffset,
long len,
int bufferSize,
boolean verifyChecksum,
String clientName,
CachingStrategy curCachingStrategy)
throws IOException {
// Firstly, we check to see if we have cached any file descriptors for
// local blocks. If so, we can just re-use those file descriptors.
FileInputStream fis[] = fileInputStreamCache.get(chosenNode, block);
if (fis != null) {
if (DFSClient.LOG.isDebugEnabled()) {
DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
"the FileInputStreamCache.");
}
return new BlockReaderLocal.Builder(dfsClient.getConf()).
setFilename(file).
setBlock(block).
setStartOffset(startOffset).
setStreams(fis).
setDatanodeID(chosenNode).
setVerifyChecksum(verifyChecksum).
setBlockMetadataHeader(BlockMetadataHeader.
preadHeader(fis[1].getChannel())).
setFileInputStreamCache(fileInputStreamCache).
setCachingStrategy(curCachingStrategy).
build();
}
// If the legacy local block reader is enabled and we are reading a local
// block, try to create a BlockReaderLocalLegacy. The legacy local block
// reader implements local reads in the style first introduced by HDFS-2246.
if ((dfsClient.useLegacyBlockReaderLocal()) &&
DFSClient.isLocalAddress(dnAddr) &&
(!shortCircuitForbidden())) {
try {
return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient,
clientName, block, blockToken, chosenNode, startOffset);
} catch (IOException e) {
DFSClient.LOG.warn("error creating legacy BlockReaderLocal. " +
"Disabling legacy local reads.", e);
dfsClient.disableLegacyBlockReaderLocal();
}
}
// Look for cached domain peers.
int cacheTries = 0;
DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory();
BlockReader reader = null;
final int nCachedConnRetry = dfsClient.getConf().nCachedConnRetry;
for (; cacheTries < nCachedConnRetry; ++cacheTries) {
Peer peer = peerCache.get(chosenNode, true);
if (peer == null) break;
try {
boolean allowShortCircuitLocalReads = dfsClient.getConf().
shortCircuitLocalReads && (!shortCircuitForbidden());
reader = BlockReaderFactory.newBlockReader(
dfsClient.getConf(), file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, peerCache, fileInputStreamCache,
allowShortCircuitLocalReads, curCachingStrategy);
return reader;
} catch (IOException ex) {
DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
"Closing stale " + peer, ex);
} finally {
if (reader == null) {
IOUtils.closeQuietly(peer);
}
}
}
// Try to create a DomainPeer.
DomainSocket domSock = dsFactory.create(dnAddr, this);
if (domSock != null) {
Peer peer = new DomainPeer(domSock);
try {
boolean allowShortCircuitLocalReads = dfsClient.getConf().
shortCircuitLocalReads && (!shortCircuitForbidden());
reader = BlockReaderFactory.newBlockReader(
dfsClient.getConf(), file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, peerCache, fileInputStreamCache,
allowShortCircuitLocalReads, curCachingStrategy);
return reader;
} catch (IOException e) {
DFSClient.LOG.warn("failed to connect to " + domSock, e);
} finally {
if (reader == null) {
// If the Peer that we got the error from was a DomainPeer,
// mark the socket path as bad, so that newDataSocket will not try
// to re-open this socket for a while.
dsFactory.disableDomainSocketPath(domSock.getPath());
IOUtils.closeQuietly(peer);
}
}
}
// Look for cached peers.
for (; cacheTries < nCachedConnRetry; ++cacheTries) {
Peer peer = peerCache.get(chosenNode, false);
if (peer == null) break;
try {
reader = BlockReaderFactory.newBlockReader(
dfsClient.getConf(), file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, peerCache, fileInputStreamCache, false,
curCachingStrategy);
return reader;
} catch (IOException ex) {
DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
peer, ex);
} finally {
if (reader == null) {
IOUtils.closeQuietly(peer);
}
}
}
if (tcpReadsDisabledForTesting) {
throw new IOException("TCP reads are disabled.");
}
// Try to create a new remote peer.
Peer peer = newTcpPeer(dnAddr);
try {
reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file,
block, blockToken, startOffset, len, verifyChecksum, clientName,
peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false,
curCachingStrategy);
return reader;
} catch (IOException ex) {
DFSClient.LOG.debug(
"Exception while getting block reader, closing stale " + peer, ex);
throw ex;
} finally {
if (reader == null) {
IOUtils.closeQuietly(peer);
}
}
}
/**
* Read bytes starting from the specified position.
*
@@ -1555,8 +1377,7 @@ private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
long blockStartInFile = currentLocatedBlock.getStartOffset();
long blockPos = curPos - blockStartInFile;
long limit = blockPos + length;
ClientMmap clientMmap =
blockReader.getClientMmap(opts, dfsClient.getMmapManager());
ClientMmap clientMmap = blockReader.getClientMmap(opts);
if (clientMmap == null) {
if (DFSClient.LOG.isDebugEnabled()) {
DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
@@ -1565,17 +1386,25 @@ private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
}
return null;
}
seek(pos + length);
ByteBuffer buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
buffer.position((int)blockPos);
buffer.limit((int)limit);
clientMmap.ref();
extendedReadBuffers.put(buffer, clientMmap);
readStatistics.addZeroCopyBytes(length);
if (DFSClient.LOG.isDebugEnabled()) {
DFSClient.LOG.debug("readZeroCopy read " + maxLength + " bytes from " +
"offset " + curPos + " via the zero-copy read path. " +
"blockEnd = " + blockEnd);
boolean success = false;
ByteBuffer buffer;
try {
seek(pos + length);
buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
buffer.position((int)blockPos);
buffer.limit((int)limit);
extendedReadBuffers.put(buffer, clientMmap);
readStatistics.addZeroCopyBytes(length);
if (DFSClient.LOG.isDebugEnabled()) {
DFSClient.LOG.debug("readZeroCopy read " + maxLength + " bytes from " +
"offset " + curPos + " via the zero-copy read path. " +
"blockEnd = " + blockEnd);
}
success = true;
} finally {
if (!success) {
clientMmap.unref();
}
}
return buffer;
}
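The try/finally added above guarantees the ClientMmap reference is dropped if anything fails before the buffer is handed out. From an application's point of view, this path is reached through the enhanced byte-buffer read API; a rough usage sketch, assuming ElasticByteBufferPool from org.apache.hadoop.io and a one-megabyte request size chosen only for illustration:

    FSDataInputStream in = fs.open(path);
    ByteBufferPool pool = new ElasticByteBufferPool();
    // May return a memory-mapped (zero-copy) buffer or a fallback buffer from the pool;
    // returns null at end of file.
    ByteBuffer buf = in.read(pool, 1024 * 1024, EnumSet.of(ReadOption.SKIP_CHECKSUMS));
    try {
      // consume buf ...
    } finally {
      if (buf != null) {
        in.releaseBuffer(buf);   // releases the underlying ClientMmap reference, if any
      }
    }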

View File

@@ -27,29 +27,71 @@
import org.apache.hadoop.hdfs.DFSClient.Conf;
import org.apache.hadoop.net.unix.DomainSocket;
import com.google.common.base.Preconditions;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
class DomainSocketFactory {
private static final Log LOG = BlockReaderLocal.LOG;
private final Conf conf;
enum PathStatus {
UNUSABLE,
SHORT_CIRCUIT_DISABLED,
public enum PathState {
UNUSABLE(false, false),
SHORT_CIRCUIT_DISABLED(true, false),
VALID(true, true);
PathState(boolean usableForDataTransfer, boolean usableForShortCircuit) {
this.usableForDataTransfer = usableForDataTransfer;
this.usableForShortCircuit = usableForShortCircuit;
}
public boolean getUsableForDataTransfer() {
return usableForDataTransfer;
}
public boolean getUsableForShortCircuit() {
return usableForShortCircuit;
}
private final boolean usableForDataTransfer;
private final boolean usableForShortCircuit;
}
public static class PathInfo {
private final static PathInfo NOT_CONFIGURED =
new PathInfo("", PathState.UNUSABLE);
final private String path;
final private PathState state;
PathInfo(String path, PathState state) {
this.path = path;
this.state = state;
}
public String getPath() {
return path;
}
public PathState getPathState() {
return state;
}
@Override
public String toString() {
return new StringBuilder().append("PathInfo{path=").append(path).
append(", state=").append(state).append("}").toString();
}
}
/**
* Information about domain socket paths.
*/
Cache<String, PathStatus> pathInfo =
Cache<String, PathState> pathMap =
CacheBuilder.newBuilder()
.expireAfterWrite(10, TimeUnit.MINUTES)
.build();
public DomainSocketFactory(Conf conf) {
this.conf = conf;
final String feature;
if (conf.shortCircuitLocalReads && (!conf.useLegacyBlockReaderLocal)) {
feature = "The short-circuit local reads feature";
@@ -75,51 +117,46 @@ public DomainSocketFactory(Conf conf) {
}
/**
* Create a DomainSocket.
*
* @param addr The address of the DataNode
* @param stream The DFSInputStream the socket will be created for.
* Get information about a domain socket path.
*
* @return null if the socket could not be created; the
* socket otherwise. If there was an error while
* creating the socket, we will add the socket path
* to our list of failed domain socket paths.
* @param addr The inet address to use.
* @param conf The client configuration.
*
* @return Information about the socket path.
*/
DomainSocket create(InetSocketAddress addr, DFSInputStream stream) {
public PathInfo getPathInfo(InetSocketAddress addr, DFSClient.Conf conf) {
// If there is no domain socket path configured, we can't use domain
// sockets.
if (conf.domainSocketPath.isEmpty()) return null;
if (conf.domainSocketPath.isEmpty()) return PathInfo.NOT_CONFIGURED;
// If we can't do anything with the domain socket, don't create it.
if (!conf.domainSocketDataTraffic &&
(!conf.shortCircuitLocalReads || conf.useLegacyBlockReaderLocal)) {
return null;
return PathInfo.NOT_CONFIGURED;
}
// UNIX domain sockets can only be used to talk to local peers
if (!DFSClient.isLocalAddress(addr)) return null;
// If the DomainSocket code is not loaded, we can't create
// DomainSocket objects.
if (DomainSocket.getLoadingFailureReason() != null) return null;
if (DomainSocket.getLoadingFailureReason() != null) {
return PathInfo.NOT_CONFIGURED;
}
// UNIX domain sockets can only be used to talk to local peers
if (!DFSClient.isLocalAddress(addr)) return PathInfo.NOT_CONFIGURED;
String escapedPath = DomainSocket.
getEffectivePath(conf.domainSocketPath, addr.getPort());
PathStatus info = pathInfo.getIfPresent(escapedPath);
if (info == PathStatus.UNUSABLE) {
// We tried to connect to this domain socket before, and it was totally
// unusable.
return null;
}
if ((!conf.domainSocketDataTraffic) &&
((info == PathStatus.SHORT_CIRCUIT_DISABLED) ||
stream.shortCircuitForbidden())) {
// If we don't want to pass data over domain sockets, and we don't want
// to pass file descriptors over them either, we have no use for domain
// sockets.
return null;
PathState status = pathMap.getIfPresent(escapedPath);
if (status == null) {
return new PathInfo(escapedPath, PathState.VALID);
} else {
return new PathInfo(escapedPath, status);
}
}
public DomainSocket createSocket(PathInfo info, int socketTimeout) {
Preconditions.checkArgument(info.getPathState() != PathState.UNUSABLE);
boolean success = false;
DomainSocket sock = null;
try {
sock = DomainSocket.connect(escapedPath);
sock.setAttribute(DomainSocket.RECEIVE_TIMEOUT, conf.socketTimeout);
sock = DomainSocket.connect(info.getPath());
sock.setAttribute(DomainSocket.RECEIVE_TIMEOUT, socketTimeout);
success = true;
} catch (IOException e) {
LOG.warn("error creating DomainSocket", e);
@@ -129,7 +166,7 @@ DomainSocket create(InetSocketAddress addr, DFSInputStream stream) {
if (sock != null) {
IOUtils.closeQuietly(sock);
}
pathInfo.put(escapedPath, PathStatus.UNUSABLE);
pathMap.put(info.getPath(), PathState.UNUSABLE);
sock = null;
}
}
@@ -137,10 +174,10 @@ DomainSocket create(InetSocketAddress addr, DFSInputStream stream) {
}
public void disableShortCircuitForPath(String path) {
pathInfo.put(path, PathStatus.SHORT_CIRCUIT_DISABLED);
pathMap.put(path, PathState.SHORT_CIRCUIT_DISABLED);
}
public void disableDomainSocketPath(String path) {
pathInfo.put(path, PathStatus.UNUSABLE);
pathMap.put(path, PathState.UNUSABLE);
}
}
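The refactoring above splits the old create() call into a query step (getPathInfo) and a connect step (createSocket). A rough sketch of how a caller is expected to combine them; the variable names and the fallback comment are illustrative:

    DomainSocketFactory dsFactory = new DomainSocketFactory(conf);
    DomainSocketFactory.PathInfo info = dsFactory.getPathInfo(addr, conf);
    if (info.getPathState().getUsableForDataTransfer()) {
      DomainSocket sock = dsFactory.createSocket(info, conf.socketTimeout);
      if (sock == null) {
        // createSocket marked the path UNUSABLE in the cache; fall back to TCP.
      }
    }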

View File

@@ -0,0 +1,75 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import org.apache.commons.lang.builder.EqualsBuilder;
import org.apache.commons.lang.builder.HashCodeBuilder;
/**
* An immutable key which identifies a block.
*/
final public class ExtendedBlockId {
/**
* The block ID for this block.
*/
private final long blockId;
/**
* The block pool ID for this block.
*/
private final String bpId;
public ExtendedBlockId(long blockId, String bpId) {
this.blockId = blockId;
this.bpId = bpId;
}
public long getBlockId() {
return this.blockId;
}
public String getBlockPoolId() {
return this.bpId;
}
@Override
public boolean equals(Object o) {
if ((o == null) || (o.getClass() != this.getClass())) {
return false;
}
ExtendedBlockId other = (ExtendedBlockId)o;
return new EqualsBuilder().
append(blockId, other.blockId).
append(bpId, other.bpId).
isEquals();
}
@Override
public int hashCode() {
return new HashCodeBuilder().
append(this.blockId).
append(this.bpId).
toHashCode();
}
@Override
public String toString() {
return new StringBuilder().append(blockId).
append("_").append(bpId).toString();
}
}
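Because equals() and hashCode() cover both the block ID and the block pool ID, the class can be used directly as a map key. A trivial sketch with made-up identifiers:

    Map<ExtendedBlockId, String> cache = new HashMap<ExtendedBlockId, String>();
    ExtendedBlockId key = new ExtendedBlockId(1073741825L, "BP-example-pool");
    cache.put(key, "replica-placeholder");
    // An equal key (same block ID and same block pool ID) hits the same entry.
    boolean hit = cache.containsKey(new ExtendedBlockId(1073741825L, "BP-example-pool"));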

View File

@@ -1,287 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Time;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* FileInputStream cache is used to cache FileInputStream objects that we
* have received from the DataNode.
*/
class FileInputStreamCache {
private final static Log LOG = LogFactory.getLog(FileInputStreamCache.class);
/**
* The executor service that runs the cacheCleaner. There is only one of
* these per VM.
*/
private final static ScheduledThreadPoolExecutor executor
= new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
setDaemon(true).setNameFormat("FileInputStreamCache Cleaner").
build());
/**
* The CacheCleaner for this FileInputStreamCache. We don't create this
* and schedule it until it becomes necessary.
*/
private CacheCleaner cacheCleaner;
/**
* Maximum number of entries to allow in the cache.
*/
private final int maxCacheSize;
/**
* The minimum time in milliseconds to preserve an element in the cache.
*/
private final long expiryTimeMs;
/**
* True if the FileInputStreamCache is closed.
*/
private boolean closed = false;
/**
* Cache entries.
*/
private final LinkedListMultimap<Key, Value> map = LinkedListMultimap.create();
/**
* Expiry thread which makes sure that the file descriptors get closed
* after a while.
*/
private static class CacheCleaner implements Runnable, Closeable {
private WeakReference<FileInputStreamCache> cacheRef;
private ScheduledFuture<?> future;
CacheCleaner(FileInputStreamCache cache) {
this.cacheRef = new WeakReference<FileInputStreamCache>(cache);
}
@Override
public void run() {
FileInputStreamCache cache = cacheRef.get();
if (cache == null) return;
synchronized(cache) {
if (cache.closed) return;
long curTime = Time.monotonicNow();
for (Iterator<Entry<Key, Value>> iter =
cache.map.entries().iterator(); iter.hasNext();
iter = cache.map.entries().iterator()) {
Entry<Key, Value> entry = iter.next();
if (entry.getValue().getTime() + cache.expiryTimeMs >= curTime) {
break;
}
entry.getValue().close();
iter.remove();
}
}
}
@Override
public void close() throws IOException {
if (future != null) {
future.cancel(false);
}
}
public void setFuture(ScheduledFuture<?> future) {
this.future = future;
}
}
/**
* The key identifying a FileInputStream array.
*/
static class Key {
private final DatanodeID datanodeID;
private final ExtendedBlock block;
public Key(DatanodeID datanodeID, ExtendedBlock block) {
this.datanodeID = datanodeID;
this.block = block;
}
@Override
public boolean equals(Object other) {
if (!(other instanceof FileInputStreamCache.Key)) {
return false;
}
FileInputStreamCache.Key otherKey = (FileInputStreamCache.Key)other;
return (block.equals(otherKey.block) &&
(block.getGenerationStamp() == otherKey.block.getGenerationStamp()) &&
datanodeID.equals(otherKey.datanodeID));
}
@Override
public int hashCode() {
return block.hashCode();
}
}
/**
* The value containing a FileInputStream array and the time it was added to
* the cache.
*/
static class Value {
private final FileInputStream fis[];
private final long time;
public Value (FileInputStream fis[]) {
this.fis = fis;
this.time = Time.monotonicNow();
}
public FileInputStream[] getFileInputStreams() {
return fis;
}
public long getTime() {
return time;
}
public void close() {
IOUtils.cleanup(LOG, fis);
}
}
/**
* Create a new FileInputStreamCache.
*
* @param maxCacheSize The maximum number of elements to allow in
* the cache.
* @param expiryTimeMs The minimum time in milliseconds to preserve
* elements in the cache.
*/
public FileInputStreamCache(int maxCacheSize, long expiryTimeMs) {
this.maxCacheSize = maxCacheSize;
this.expiryTimeMs = expiryTimeMs;
}
/**
* Put an array of FileInputStream objects into the cache.
*
* @param datanodeID The DatanodeID to store the streams under.
* @param block The Block to store the streams under.
* @param fis The streams.
*/
public void put(DatanodeID datanodeID, ExtendedBlock block,
FileInputStream fis[]) {
boolean inserted = false;
try {
synchronized(this) {
if (closed) return;
if (map.size() + 1 > maxCacheSize) {
Iterator<Entry<Key, Value>> iter = map.entries().iterator();
if (!iter.hasNext()) return;
Entry<Key, Value> entry = iter.next();
entry.getValue().close();
iter.remove();
}
if (cacheCleaner == null) {
cacheCleaner = new CacheCleaner(this);
ScheduledFuture<?> future =
executor.scheduleAtFixedRate(cacheCleaner, expiryTimeMs, expiryTimeMs,
TimeUnit.MILLISECONDS);
cacheCleaner.setFuture(future);
}
map.put(new Key(datanodeID, block), new Value(fis));
inserted = true;
}
} finally {
if (!inserted) {
IOUtils.cleanup(LOG, fis);
}
}
}
/**
* Find and remove an array of FileInputStream objects from the cache.
*
* @param datanodeID The DatanodeID to search for.
* @param block The Block to search for.
*
* @return null if no streams can be found; the
* array otherwise. If this is non-null, the
* array will have been removed from the cache.
*/
public synchronized FileInputStream[] get(DatanodeID datanodeID,
ExtendedBlock block) {
Key key = new Key(datanodeID, block);
List<Value> ret = map.get(key);
if (ret.isEmpty()) return null;
Value val = ret.get(0);
map.remove(key, val);
return val.getFileInputStreams();
}
/**
* Close the cache and free all associated resources.
*/
public synchronized void close() {
if (closed) return;
closed = true;
IOUtils.cleanup(LOG, cacheCleaner);
for (Iterator<Entry<Key, Value>> iter = map.entries().iterator();
iter.hasNext();) {
Entry<Key, Value> entry = iter.next();
entry.getValue().close();
iter.remove();
}
}
public synchronized String toString() {
StringBuilder bld = new StringBuilder();
bld.append("FileInputStreamCache(");
String prefix = "";
for (Entry<Key, Value> entry : map.entries()) {
bld.append(prefix);
bld.append(entry.getKey());
prefix = ", ";
}
bld.append(")");
return bld.toString();
}
public long getExpiryTimeMs() {
return expiryTimeMs;
}
public int getMaxCacheSize() {
return maxCacheSize;
}
}

View File

@@ -89,42 +89,19 @@ long getTime() {
LinkedListMultimap.create();
private final int capacity;
private final long expiryPeriod;
private static PeerCache instance = null;
@VisibleForTesting
PeerCache(int c, long e) {
public PeerCache(int c, long e) {
this.capacity = c;
this.expiryPeriod = e;
if (capacity == 0 ) {
LOG.info("SocketCache disabled.");
}
else if (expiryPeriod == 0) {
} else if (expiryPeriod == 0) {
throw new IllegalStateException("Cannot initialize expiryPeriod to " +
expiryPeriod + "when cache is enabled.");
expiryPeriod + " when cache is enabled.");
}
}
public static synchronized PeerCache getInstance(int c, long e) {
// capacity is only initialized once
if (instance == null) {
instance = new PeerCache(c, e);
} else { //already initialized once
if (instance.capacity != c || instance.expiryPeriod != e) {
LOG.info("capacity and expiry periods already set to " +
instance.capacity + " and " + instance.expiryPeriod +
" respectively. Cannot set it to " + c + " and " + e);
}
}
return instance;
}
@VisibleForTesting
public static synchronized void setInstance(int c, long e) {
instance = new PeerCache(c, e);
}
private boolean isDaemonStarted() {
return (daemon == null)? false: true;
}
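With the singleton accessors removed, a PeerCache is now constructed directly, so each cache owner (such as a ClientContext) gets its own instance. A minimal sketch; the capacity and expiry values are illustrative only:

    // 16 cached peers, 3 second expiry -- illustrative values, not defaults
    PeerCache peerCache = new PeerCache(16, 3000);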

View File

@@ -30,7 +30,6 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -492,8 +491,7 @@ public boolean isShortCircuit() {
}
@Override
public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
ClientMmapManager mmapManager) {
public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
return null;
}
}

View File

@@ -32,7 +32,6 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -457,8 +456,7 @@ public boolean isShortCircuit() {
}
@Override
public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
ClientMmapManager mmapManager) {
public ClientMmap getClientMmap(EnumSet<ReadOption> opts) {
return null;
}
}

View File

@@ -0,0 +1,37 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.IOException;
import java.net.InetSocketAddress;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.security.UserGroupInformation;
public interface RemotePeerFactory {
/**
* @param addr The address to connect to.
*
* @return A new Peer connected to the address.
*
* @throws IOException If there was an error connecting or creating
* the remote socket, encrypted stream, etc.
*/
Peer newConnectedPeer(InetSocketAddress addr) throws IOException;
}
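DFSClient provides the production implementation of this interface earlier in the patch. For a test or an alternative transport, a minimal implementation could look roughly like the following; the use of javax.net.SocketFactory.getDefault(), TcpPeerServer.peerFromSocket, and the 60 second timeout are assumptions for illustration:

    RemotePeerFactory factory = new RemotePeerFactory() {
      @Override
      public Peer newConnectedPeer(InetSocketAddress addr) throws IOException {
        Socket sock = javax.net.SocketFactory.getDefault().createSocket();
        try {
          NetUtils.connect(sock, addr, 60000);           // illustrative timeout
          return TcpPeerServer.peerFromSocket(sock);     // unencrypted peer (assumed helper)
        } catch (IOException e) {
          IOUtils.closeSocket(sock);
          throw e;
        }
      }
    };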

View File

@@ -17,24 +17,14 @@
*/
package org.apache.hadoop.hdfs.client;
import java.io.FileInputStream;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.nativeio.NativeIO;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.annotations.VisibleForTesting;
/**
* A memory-mapped region used by an HDFS client.
*
@@ -46,111 +36,46 @@ public class ClientMmap {
static final Log LOG = LogFactory.getLog(ClientMmap.class);
/**
* A reference to the manager of this mmap.
*
* This is only a weak reference to help minimize the damage done by
* code which leaks references accidentally.
* A reference to the block replica which this mmap relates to.
*/
private final WeakReference<ClientMmapManager> manager;
private final ShortCircuitReplica replica;
/**
* The actual mapped memory region.
* The java ByteBuffer object.
*/
private final MappedByteBuffer map;
/**
* A reference count tracking how many threads are using this object.
* Reference count of this ClientMmap object.
*/
private final AtomicInteger refCount = new AtomicInteger(1);
/**
* Block pertaining to this mmap
*/
private final ExtendedBlock block;
/**
* The DataNode where this mmap came from.
*/
private final DatanodeID datanodeID;
/**
* The monotonic time when this mmap was last evictable.
*/
private long lastEvictableTimeNs;
public static ClientMmap load(ClientMmapManager manager, FileInputStream in,
ExtendedBlock block, DatanodeID datanodeID)
throws IOException {
MappedByteBuffer map =
in.getChannel().map(MapMode.READ_ONLY, 0,
in.getChannel().size());
return new ClientMmap(manager, map, block, datanodeID);
}
private ClientMmap(ClientMmapManager manager, MappedByteBuffer map,
ExtendedBlock block, DatanodeID datanodeID)
throws IOException {
this.manager = new WeakReference<ClientMmapManager>(manager);
ClientMmap(ShortCircuitReplica replica, MappedByteBuffer map) {
this.replica = replica;
this.map = map;
this.block = block;
this.datanodeID = datanodeID;
this.lastEvictableTimeNs = 0;
}
/**
* Decrement the reference count on this object.
* Should be called with the ClientMmapManager lock held.
* Increment the reference count.
*
* @return The new reference count.
*/
void ref() {
refCount.addAndGet(1);
}
/**
* Decrement the reference count.
*
* The parent replica gets unreferenced each time the reference count
* of this object goes to 0.
*/
public void unref() {
int count = refCount.decrementAndGet();
if (count < 0) {
throw new IllegalArgumentException("can't decrement the " +
"reference count on this ClientMmap lower than 0.");
} else if (count == 0) {
ClientMmapManager man = manager.get();
if (man == null) {
unmap();
} else {
man.makeEvictable(this);
}
}
}
/**
* Increment the reference count on this object.
*
* @return The new reference count.
*/
public int ref() {
return refCount.getAndIncrement();
}
@VisibleForTesting
public ExtendedBlock getBlock() {
return block;
}
DatanodeID getDatanodeID() {
return datanodeID;
refCount.addAndGet(-1);
replica.unref();
}
public MappedByteBuffer getMappedByteBuffer() {
return map;
}
public void setLastEvictableTimeNs(long lastEvictableTimeNs) {
this.lastEvictableTimeNs = lastEvictableTimeNs;
}
public long getLastEvictableTimeNs() {
return this.lastEvictableTimeNs;
}
/**
* Unmap the memory region.
*/
void unmap() {
assert(refCount.get() == 0);
NativeIO.POSIX.munmap(map);
}
}
}
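The reworked ClientMmap simply pins its ShortCircuitReplica while the mapping is in use, so every reference handed out must eventually be balanced by unref(), as the zero-copy read path above does. A condensed sketch of the expected lifecycle (blockReader and opts stand for whatever the caller already has in scope):

    ClientMmap clientMmap = blockReader.getClientMmap(opts);
    if (clientMmap != null) {
      boolean success = false;
      try {
        ByteBuffer buffer = clientMmap.getMappedByteBuffer().asReadOnlyBuffer();
        // hand buffer out and remember which ClientMmap it belongs to ...
        success = true;
      } finally {
        if (!success) {
          clientMmap.unref();   // drop the reference so the replica can become evictable
        }
      }
      // when the consumer is done with the buffer, unref() is called exactly once more
    }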

View File

@@ -1,482 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.client;
import java.io.Closeable;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.FileInputStream;
import java.io.IOException;
import java.lang.ref.WeakReference;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.IOUtils;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ComparisonChain;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* Tracks mmap instances used on an HDFS client.
*
* mmaps can be used concurrently by multiple threads at once.
* mmaps cannot be closed while they are in use.
*
* The cache is important for performance, because the first time an mmap is
* created, the page table entries (PTEs) are not yet set up.
* Even when reading data that is entirely resident in memory, reading an
* mmap the second time is faster.
*/
@InterfaceAudience.Private
public class ClientMmapManager implements Closeable {
public static final Log LOG = LogFactory.getLog(ClientMmapManager.class);
private boolean closed = false;
private final int cacheSize;
private final long timeoutNs;
private final int runsPerTimeout;
private final Lock lock = new ReentrantLock();
/**
* Maps block, datanode_id to the client mmap object.
* If the ClientMmap is in the process of being loaded,
* {@link Waitable<ClientMmap>#await()} will block.
*
* Protected by the ClientMmapManager lock.
*/
private final TreeMap<Key, Waitable<ClientMmap>> mmaps =
new TreeMap<Key, Waitable<ClientMmap>>();
/**
* Maps the last use time to the client mmap object.
* We ensure that each last use time is unique by inserting a jitter of a
* nanosecond or two if necessary.
*
* Protected by the ClientMmapManager lock.
* ClientMmap objects that are in use are never evictable.
*/
private final TreeMap<Long, ClientMmap> evictable =
new TreeMap<Long, ClientMmap>();
private final ScheduledThreadPoolExecutor executor =
new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
setDaemon(true).setNameFormat("ClientMmapManager").
build());
/**
* The CacheCleaner for this ClientMmapManager. We don't create this
* and schedule it until it becomes necessary.
*/
private CacheCleaner cacheCleaner;
/**
* Factory method to create a ClientMmapManager from a Hadoop
* configuration.
*/
public static ClientMmapManager fromConf(Configuration conf) {
return new ClientMmapManager(conf.getInt(DFS_CLIENT_MMAP_CACHE_SIZE,
DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
conf.getLong(DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
conf.getInt(DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT,
DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT));
}
public ClientMmapManager(int cacheSize, long timeoutMs, int runsPerTimeout) {
this.cacheSize = cacheSize;
this.timeoutNs = timeoutMs * 1000000;
this.runsPerTimeout = runsPerTimeout;
}
long getTimeoutMs() {
return this.timeoutNs / 1000000;
}
int getRunsPerTimeout() {
return this.runsPerTimeout;
}
public String verifyConfigurationMatches(Configuration conf) {
StringBuilder bld = new StringBuilder();
int cacheSize = conf.getInt(DFS_CLIENT_MMAP_CACHE_SIZE,
DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT);
if (this.cacheSize != cacheSize) {
bld.append("You specified a cache size of ").append(cacheSize).
append(", but the existing cache size is ").append(this.cacheSize).
append(". ");
}
long timeoutMs = conf.getLong(DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT);
if (getTimeoutMs() != timeoutMs) {
bld.append("You specified a cache timeout of ").append(timeoutMs).
append(" ms, but the existing cache timeout is ").
append(getTimeoutMs()).append("ms").append(". ");
}
int runsPerTimeout = conf.getInt(
DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT,
DFS_CLIENT_MMAP_CACHE_THREAD_RUNS_PER_TIMEOUT_DEFAULT);
if (getRunsPerTimeout() != runsPerTimeout) {
bld.append("You specified ").append(runsPerTimeout).
append(" runs per timeout, but the existing runs per timeout is ").
append(getTimeoutMs()).append(". ");
}
return bld.toString();
}
private static class Waitable<T> {
private T val;
private final Condition cond;
public Waitable(Condition cond) {
this.val = null;
this.cond = cond;
}
public T await() throws InterruptedException {
while (this.val == null) {
this.cond.await();
}
return this.val;
}
public void provide(T val) {
this.val = val;
this.cond.signalAll();
}
}
private static class Key implements Comparable<Key> {
private final ExtendedBlock block;
private final DatanodeID datanode;
Key(ExtendedBlock block, DatanodeID datanode) {
this.block = block;
this.datanode = datanode;
}
/**
* Compare two ClientMmap regions that we're storing.
*
* When we append to a block, we bump the genstamp. It is important to
* compare the genStamp here. That way, we will not return a shorter
* mmap than required.
*/
@Override
public int compareTo(Key o) {
return ComparisonChain.start().
compare(block.getBlockId(), o.block.getBlockId()).
compare(block.getGenerationStamp(), o.block.getGenerationStamp()).
compare(block.getBlockPoolId(), o.block.getBlockPoolId()).
compare(datanode, o.datanode).
result();
}
@Override
public boolean equals(Object rhs) {
if (rhs == null) {
return false;
}
try {
Key o = (Key)rhs;
return (compareTo(o) == 0);
} catch (ClassCastException e) {
return false;
}
}
@Override
public int hashCode() {
return block.hashCode() ^ datanode.hashCode();
}
}
/**
* Thread which handles expiring mmaps from the cache.
*/
private static class CacheCleaner implements Runnable, Closeable {
private WeakReference<ClientMmapManager> managerRef;
private ScheduledFuture<?> future;
CacheCleaner(ClientMmapManager manager) {
this.managerRef= new WeakReference<ClientMmapManager>(manager);
}
@Override
public void run() {
ClientMmapManager manager = managerRef.get();
if (manager == null) return;
long curTime = System.nanoTime();
try {
manager.lock.lock();
manager.evictStaleEntries(curTime);
} finally {
manager.lock.unlock();
}
}
void setFuture(ScheduledFuture<?> future) {
this.future = future;
}
@Override
public void close() throws IOException {
future.cancel(false);
}
}
/**
* Evict entries which are older than curTime + timeoutNs from the cache.
*
* NOTE: you must call this function with the lock held.
*/
private void evictStaleEntries(long curTime) {
if (closed) {
return;
}
Iterator<Entry<Long, ClientMmap>> iter =
evictable.entrySet().iterator();
while (iter.hasNext()) {
Entry<Long, ClientMmap> entry = iter.next();
if (entry.getKey() + timeoutNs >= curTime) {
return;
}
ClientMmap mmap = entry.getValue();
Key key = new Key(mmap.getBlock(), mmap.getDatanodeID());
mmaps.remove(key);
iter.remove();
mmap.unmap();
}
}
/**
* Evict one mmap object from the cache.
*
* NOTE: you must call this function with the lock held.
*
* @return True if an object was evicted; false if none
* could be evicted.
*/
private boolean evictOne() {
Entry<Long, ClientMmap> entry = evictable.pollFirstEntry();
if (entry == null) {
// We don't want to try creating another mmap region, because the
// cache is full.
return false;
}
ClientMmap evictedMmap = entry.getValue();
Key evictedKey = new Key(evictedMmap.getBlock(),
evictedMmap.getDatanodeID());
mmaps.remove(evictedKey);
evictedMmap.unmap();
return true;
}
/**
* Create a new mmap object.
*
* NOTE: you must call this function with the lock held.
*
* @param key The key which describes this mmap.
* @param in The input stream to use to create the mmap.
* @return The new mmap object, or null if there were
* insufficient resources.
* @throws IOException If there was an I/O error creating the mmap.
*/
private ClientMmap create(Key key, FileInputStream in) throws IOException {
if (mmaps.size() + 1 > cacheSize) {
if (!evictOne()) {
LOG.warn("mmap cache is full (with " + cacheSize + " elements) and " +
"nothing is evictable. Ignoring request for mmap with " +
"datanodeID=" + key.datanode + ", " + "block=" + key.block);
return null;
}
}
// Create the condition variable that other threads may wait on.
Waitable<ClientMmap> waitable =
new Waitable<ClientMmap>(lock.newCondition());
mmaps.put(key, waitable);
// Load the entry
boolean success = false;
ClientMmap mmap = null;
try {
try {
lock.unlock();
mmap = ClientMmap.load(this, in, key.block, key.datanode);
} finally {
lock.lock();
}
if (cacheCleaner == null) {
cacheCleaner = new CacheCleaner(this);
ScheduledFuture<?> future =
executor.scheduleAtFixedRate(cacheCleaner,
timeoutNs, timeoutNs / runsPerTimeout, TimeUnit.NANOSECONDS);
cacheCleaner.setFuture(future);
}
success = true;
} finally {
if (!success) {
LOG.warn("failed to create mmap for datanodeID=" + key.datanode +
", " + "block=" + key.block);
mmaps.remove(key);
}
waitable.provide(mmap);
}
if (LOG.isDebugEnabled()) {
LOG.info("created a new ClientMmap for block " + key.block +
" on datanode " + key.datanode);
}
return mmap;
}
/**
* Get or create an mmap region.
*
* @param node The DataNode that owns the block for this mmap region.
* @param block The block ID, block pool ID, and generation stamp of
* the block we want to read.
* @param in An open file for this block. This stream is only used
* if we have to create a new mmap; if we use an
* existing one, it is ignored.
*
* @return The client mmap region.
*/
public ClientMmap fetch(DatanodeID datanodeID, ExtendedBlock block,
FileInputStream in) throws IOException, InterruptedException {
LOG.debug("fetching mmap with datanodeID=" + datanodeID + ", " +
"block=" + block);
Key key = new Key(block, datanodeID);
ClientMmap mmap = null;
try {
lock.lock();
if (closed) {
throw new IOException("ClientMmapManager is closed.");
}
while (mmap == null) {
Waitable<ClientMmap> entry = mmaps.get(key);
if (entry == null) {
return create(key, in);
}
mmap = entry.await();
}
if (mmap.ref() == 1) {
// When going from nobody using the mmap (ref = 0) to somebody
// using the mmap (ref = 1), we must make the mmap un-evictable.
evictable.remove(mmap.getLastEvictableTimeNs());
}
}
finally {
lock.unlock();
}
if (LOG.isDebugEnabled()) {
LOG.debug("reusing existing mmap with datanodeID=" + datanodeID +
", " + "block=" + block);
}
return mmap;
}
/**
* Make an mmap evictable.
*
* When an mmap is evictable, it may be removed from the cache if necessary.
* mmaps can only be evictable if nobody is using them.
*
* @param mmap The mmap to make evictable.
*/
void makeEvictable(ClientMmap mmap) {
try {
lock.lock();
if (closed) {
// If this ClientMmapManager is closed, then don't bother with the
// cache; just close the mmap.
mmap.unmap();
return;
}
long now = System.nanoTime();
while (evictable.containsKey(now)) {
now++;
}
mmap.setLastEvictableTimeNs(now);
evictable.put(now, mmap);
} finally {
lock.unlock();
}
}
@Override
public void close() throws IOException {
try {
lock.lock();
closed = true;
IOUtils.cleanup(LOG, cacheCleaner);
// Unmap all the mmaps that nobody is using.
// The ones which are in use will be unmapped just as soon as people stop
// using them.
evictStaleEntries(Long.MAX_VALUE);
executor.shutdown();
} finally {
lock.unlock();
}
}
@VisibleForTesting
public interface ClientMmapVisitor {
void accept(ClientMmap mmap);
}
@VisibleForTesting
public synchronized void visitMmaps(ClientMmapVisitor visitor)
throws InterruptedException {
for (Waitable<ClientMmap> entry : mmaps.values()) {
visitor.accept(entry.await());
}
}
public void visitEvictable(ClientMmapVisitor visitor)
throws InterruptedException {
for (ClientMmap mmap : evictable.values()) {
visitor.accept(mmap);
}
}
}

View File

@@ -0,0 +1,881 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.client;
import java.io.Closeable;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RetriableException;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.Waitable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* The ShortCircuitCache tracks things which the client needs to access
* HDFS block files via short-circuit.
*
* These things include: memory-mapped regions, file descriptors, and shared
* memory areas for communicating with the DataNode.
*/
@InterfaceAudience.Private
public class ShortCircuitCache implements Closeable {
public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
/**
* Expiry thread which makes sure that the file descriptors get closed
* after a while.
*/
private class CacheCleaner implements Runnable, Closeable {
private ScheduledFuture<?> future;
/**
* Run the CacheCleaner thread.
*
* Whenever a thread requests a ShortCircuitReplica object, we will make
* sure it gets one. That ShortCircuitReplica object can then be re-used
* when another thread requests a ShortCircuitReplica object for the same
* block. So in that sense, there is no maximum size to the cache.
*
* However, when a ShortCircuitReplica object is unreferenced by the
* thread(s) that are using it, it becomes evictable. There are two
* separate eviction lists-- one for mmaped objects, and another for
* non-mmaped objects. We do this in order to avoid having the regular
* files kick the mmaped files out of the cache too quickly. Reusing
* an already-existing mmap gives a huge performance boost, since the
* page table entries don't have to be re-populated. Both the mmap
* and non-mmap evictable lists have maximum sizes and maximum lifespans.
*/
@Override
public void run() {
ShortCircuitCache.this.lock.lock();
try {
if (ShortCircuitCache.this.closed) return;
long curMs = Time.monotonicNow();
if (LOG.isDebugEnabled()) {
LOG.debug(this + ": cache cleaner running at " + curMs);
}
int numDemoted = demoteOldEvictableMmaped(curMs);
int numPurged = 0;
Long evictionTimeNs = Long.valueOf(0);
while (true) {
Entry<Long, ShortCircuitReplica> entry =
evictable.ceilingEntry(evictionTimeNs);
if (entry == null) break;
evictionTimeNs = entry.getKey();
long evictionTimeMs =
TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
if (evictionTimeMs + maxNonMmappedEvictableLifespanMs >= curMs) break;
ShortCircuitReplica replica = entry.getValue();
if (LOG.isTraceEnabled()) {
LOG.trace("CacheCleaner: purging " + replica + ": " +
StringUtils.getStackTrace(Thread.currentThread()));
}
purge(replica);
numPurged++;
}
if (LOG.isDebugEnabled()) {
LOG.debug(this + ": finishing cache cleaner run started at " +
curMs + ". Demoted " + numDemoted + " mmapped replicas; " +
"purged " + numPurged + " replicas.");
}
} finally {
ShortCircuitCache.this.lock.unlock();
}
}
@Override
public void close() throws IOException {
if (future != null) {
future.cancel(false);
}
}
public void setFuture(ScheduledFuture<?> future) {
this.future = future;
}
/**
* Get the rate at which this cleaner thread should be scheduled.
*
* We do this by taking the minimum expiration time and dividing by 4.
*
* @return the rate in milliseconds at which this thread should be
* scheduled.
*/
public long getRateInMs() {
long minLifespanMs =
Math.min(maxNonMmappedEvictableLifespanMs,
maxEvictableMmapedLifespanMs);
long sampleTimeMs = minLifespanMs / 4;
return (sampleTimeMs < 1) ? 1 : sampleTimeMs;
}
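// Editor's note (illustrative arithmetic, not in the original source): if
// maxNonMmappedEvictableLifespanMs were 300,000 and maxEvictableMmapedLifespanMs
// were 3,600,000, getRateInMs() would return min(300000, 3600000) / 4 = 75,000,
// i.e. the cleaner would be scheduled roughly every 75 seconds.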
}
public interface ShortCircuitReplicaCreator {
/**
* Attempt to create a ShortCircuitReplica object.
*
* This callback will be made without holding any locks.
*
* @return a non-null ShortCircuitReplicaInfo object.
*/
ShortCircuitReplicaInfo createShortCircuitReplicaInfo();
}
/**
* Lock protecting the cache.
*/
private final ReentrantLock lock = new ReentrantLock();
/**
* The executor service that runs the cacheCleaner.
*/
private final ScheduledThreadPoolExecutor executor
= new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
setDaemon(true).setNameFormat("ShortCircuitCache Cleaner").
build());
/**
* A map containing all ShortCircuitReplicaInfo objects, organized by
* ExtendedBlockId. ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
* ShortCircuitReplicaInfo objects may contain a replica, or an InvalidToken
* exception.
*/
private final HashMap<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>>
replicaInfoMap = new HashMap<ExtendedBlockId,
Waitable<ShortCircuitReplicaInfo>>();
/**
* The CacheCleaner. We don't create this and schedule it until it becomes
* necessary.
*/
private CacheCleaner cacheCleaner;
/**
* Tree of evictable elements.
*
* Maps (unique) insertion time in nanoseconds to the element.
*/
private final TreeMap<Long, ShortCircuitReplica> evictable =
new TreeMap<Long, ShortCircuitReplica>();
/**
* Maximum total size of the cache, including both mmapped and
* non-mmapped elements.
*/
private int maxTotalSize;
/**
* Non-mmaped elements older than this will be closed.
*/
private long maxNonMmappedEvictableLifespanMs;
/**
* Tree of mmaped evictable elements.
*
* Maps (unique) insertion time in nanoseconds to the element.
*/
private final TreeMap<Long, ShortCircuitReplica> evictableMmapped =
new TreeMap<Long, ShortCircuitReplica>();
/**
* Maximum number of mmaped evictable elements.
*/
private int maxEvictableMmapedSize;
/**
* Mmaped elements older than this will be closed.
*/
private final long maxEvictableMmapedLifespanMs;
/**
* The minimum number of milliseconds we'll wait after an unsuccessful
* mmap attempt before trying again.
*/
private final long mmapRetryTimeoutMs;
/**
* How long we will keep replicas in the cache before declaring them
* to be stale.
*/
private final long staleThresholdMs;
/**
* True if the ShortCircuitCache is closed.
*/
private boolean closed = false;
/**
* Number of existing mmaps associated with this cache.
*/
private int outstandingMmapCount = 0;
/**
* Create a {@link ShortCircuitCache} object from a {@link Configuration}
*/
public static ShortCircuitCache fromConf(Configuration conf) {
return new ShortCircuitCache(
conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
conf.getLong(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT),
conf.getInt(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE,
DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_SIZE_DEFAULT),
conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS,
DFSConfigKeys.DFS_CLIENT_MMAP_CACHE_TIMEOUT_MS_DEFAULT),
conf.getLong(DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS,
DFSConfigKeys.DFS_CLIENT_MMAP_RETRY_TIMEOUT_MS_DEFAULT),
conf.getLong(DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS,
DFSConfigKeys.DFS_CLIENT_SHORT_CIRCUIT_REPLICA_STALE_THRESHOLD_MS_DEFAULT));
}
public ShortCircuitCache(int maxTotalSize, long maxNonMmappedEvictableLifespanMs,
int maxEvictableMmapedSize, long maxEvictableMmapedLifespanMs,
long mmapRetryTimeoutMs, long staleThresholdMs) {
Preconditions.checkArgument(maxTotalSize >= 0);
this.maxTotalSize = maxTotalSize;
Preconditions.checkArgument(maxNonMmappedEvictableLifespanMs >= 0);
this.maxNonMmappedEvictableLifespanMs = maxNonMmappedEvictableLifespanMs;
Preconditions.checkArgument(maxEvictableMmapedSize >= 0);
this.maxEvictableMmapedSize = maxEvictableMmapedSize;
Preconditions.checkArgument(maxEvictableMmapedLifespanMs >= 0);
this.maxEvictableMmapedLifespanMs = maxEvictableMmapedLifespanMs;
this.mmapRetryTimeoutMs = mmapRetryTimeoutMs;
this.staleThresholdMs = staleThresholdMs;
}
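// Usage sketch (editor's addition): callers normally build the cache from
// configuration rather than invoking this constructor directly, e.g.
//   ShortCircuitCache cache = ShortCircuitCache.fromConf(conf);
// where conf is an org.apache.hadoop.conf.Configuration; the six constructor
// arguments above map one-to-one onto the dfs.client.* keys read in fromConf().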
public long getMmapRetryTimeoutMs() {
return mmapRetryTimeoutMs;
}
public long getStaleThresholdMs() {
return staleThresholdMs;
}
/**
* Increment the reference count of a replica, and remove it from any free
* list it may be in.
*
* You must hold the cache lock while calling this function.
*
* @param replica The replica we're removing.
*/
private void ref(ShortCircuitReplica replica) {
lock.lock();
try {
Preconditions.checkArgument(replica.refCount > 0,
"can't ref " + replica + " because its refCount reached " +
replica.refCount);
Long evictableTimeNs = replica.getEvictableTimeNs();
replica.refCount++;
if (evictableTimeNs != null) {
String removedFrom = removeEvictable(replica);
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": " + removedFrom +
" no longer contains " + replica + ". refCount " +
(replica.refCount - 1) + " -> " + replica.refCount +
StringUtils.getStackTrace(Thread.currentThread()));
}
} else if (LOG.isTraceEnabled()) {
LOG.trace(this + ": replica refCount " +
(replica.refCount - 1) + " -> " + replica.refCount +
StringUtils.getStackTrace(Thread.currentThread()));
}
} finally {
lock.unlock();
}
}
/**
* Unreference a replica.
*
* You must hold the cache lock while calling this function.
*
* @param replica The replica being unreferenced.
*/
void unref(ShortCircuitReplica replica) {
lock.lock();
try {
String addedString = "";
int newRefCount = --replica.refCount;
if (newRefCount == 0) {
// Close replica, since there are no remaining references to it.
Preconditions.checkArgument(replica.purged,
"Replica " + replica + " reached a refCount of 0 without " +
"being purged");
replica.close();
} else if (newRefCount == 1) {
Preconditions.checkState(null == replica.getEvictableTimeNs(),
"Replica " + replica + " had a refCount higher than 1, " +
"but was still evictable (evictableTimeNs = " +
replica.getEvictableTimeNs() + ")");
if (!replica.purged) {
// Add the replica to the end of an eviction list.
// Eviction lists are sorted by time.
if (replica.hasMmap()) {
insertEvictable(System.nanoTime(), replica, evictableMmapped);
addedString = "added to evictableMmapped, ";
} else {
insertEvictable(System.nanoTime(), replica, evictable);
addedString = "added to evictable, ";
}
trimEvictionMaps();
}
} else {
Preconditions.checkArgument(replica.refCount >= 0,
"replica's refCount went negative (refCount = " +
replica.refCount + " for " + replica + ")");
}
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": unref replica " + replica +
": " + addedString + " refCount " +
(newRefCount + 1) + " -> " + newRefCount +
StringUtils.getStackTrace(Thread.currentThread()));
}
} finally {
lock.unlock();
}
}
/**
* Demote old evictable mmaps into the regular eviction map.
*
* You must hold the cache lock while calling this function.
*
* @param now Current time in monotonic milliseconds.
* @return Number of replicas demoted.
*/
private int demoteOldEvictableMmaped(long now) {
int numDemoted = 0;
boolean needMoreSpace = false;
Long evictionTimeNs = Long.valueOf(0);
while (true) {
Entry<Long, ShortCircuitReplica> entry =
evictableMmapped.ceilingEntry(evictionTimeNs);
if (entry == null) break;
evictionTimeNs = entry.getKey();
long evictionTimeMs =
TimeUnit.MILLISECONDS.convert(evictionTimeNs, TimeUnit.NANOSECONDS);
if (evictionTimeMs + maxEvictableMmapedLifespanMs >= now) {
if (evictableMmapped.size() < maxEvictableMmapedSize) {
break;
}
needMoreSpace = true;
}
ShortCircuitReplica replica = entry.getValue();
if (LOG.isTraceEnabled()) {
String rationale = needMoreSpace ? "because we need more space" :
"because it's too old";
LOG.trace("demoteOldEvictable: demoting " + replica + ": " +
rationale + ": " +
StringUtils.getStackTrace(Thread.currentThread()));
}
removeEvictable(replica, evictableMmapped);
munmap(replica);
insertEvictable(evictionTimeNs, replica, evictable);
numDemoted++;
}
return numDemoted;
}
/**
* Trim the eviction lists.
*/
private void trimEvictionMaps() {
long now = Time.monotonicNow();
demoteOldEvictableMmaped(now);
while (true) {
long evictableSize = evictable.size();
long evictableMmappedSize = evictableMmapped.size();
if (evictableSize + evictableMmappedSize <= maxTotalSize) {
return;
}
ShortCircuitReplica replica;
if (evictableSize == 0) {
replica = evictableMmapped.firstEntry().getValue();
} else {
replica = evictable.firstEntry().getValue();
}
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": trimEvictionMaps is purging " +
StringUtils.getStackTrace(Thread.currentThread()));
}
purge(replica);
}
}
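// Editor's note (illustrative only): with maxTotalSize == 256, 230 plain
// evictable replicas and 30 mmapped ones, this loop purges the 4 oldest entries
// from the plain evictable map; mmapped entries are purged here only once the
// plain map has been emptied.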
/**
* Munmap a replica, updating outstandingMmapCount.
*
* @param replica The replica to munmap.
*/
private void munmap(ShortCircuitReplica replica) {
replica.munmap();
outstandingMmapCount--;
}
/**
* Remove a replica from an evictable map.
*
* @param replica The replica to remove.
* @return The map it was removed from.
*/
private String removeEvictable(ShortCircuitReplica replica) {
if (replica.hasMmap()) {
removeEvictable(replica, evictableMmapped);
return "evictableMmapped";
} else {
removeEvictable(replica, evictable);
return "evictable";
}
}
/**
* Remove a replica from an evictable map.
*
* @param replica The replica to remove.
* @param map The map to remove it from.
*/
private void removeEvictable(ShortCircuitReplica replica,
TreeMap<Long, ShortCircuitReplica> map) {
Long evictableTimeNs = replica.getEvictableTimeNs();
Preconditions.checkNotNull(evictableTimeNs);
ShortCircuitReplica removed = map.remove(evictableTimeNs);
Preconditions.checkState(removed == replica,
"failed to make " + replica + " unevictable");
replica.setEvictableTimeNs(null);
}
/**
* Insert a replica into an evictable map.
*
* If an element already exists with this eviction time, we add a nanosecond
* to it until we find an unused key.
*
* @param evictionTimeNs The eviction time in absolute nanoseconds.
* @param replica The replica to insert.
* @param map The map to insert it into.
*/
private void insertEvictable(Long evictionTimeNs,
ShortCircuitReplica replica, TreeMap<Long, ShortCircuitReplica> map) {
while (map.containsKey(evictionTimeNs)) {
evictionTimeNs++;
}
Preconditions.checkState(null == replica.getEvictableTimeNs());
Long time = Long.valueOf(evictionTimeNs);
replica.setEvictableTimeNs(time);
map.put(time, replica);
}
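// Editor's note (illustrative only): if two replicas become evictable at the
// same System.nanoTime() reading t, the first is stored under key t and the
// second under t + 1 (and so on), so no entry is silently overwritten and
// eviction order still reflects insertion order.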
/**
* Purge a replica from the cache.
*
* This doesn't necessarily close the replica, since there may be
* outstanding references to it. However, it does mean the cache won't
* hand it out to anyone after this.
*
* You must hold the cache lock while calling this function.
*
* @param replica The replica being removed.
*/
private void purge(ShortCircuitReplica replica) {
boolean removedFromInfoMap = false;
String evictionMapName = null;
Preconditions.checkArgument(!replica.purged);
replica.purged = true;
Waitable<ShortCircuitReplicaInfo> val = replicaInfoMap.get(replica.key);
if (val != null) {
ShortCircuitReplicaInfo info = val.getVal();
if ((info != null) && (info.getReplica() == replica)) {
replicaInfoMap.remove(replica.key);
removedFromInfoMap = true;
}
}
Long evictableTimeNs = replica.getEvictableTimeNs();
if (evictableTimeNs != null) {
evictionMapName = removeEvictable(replica);
}
if (LOG.isTraceEnabled()) {
StringBuilder builder = new StringBuilder();
builder.append(this).append(": removed ").
append(replica).append(" from the cache.");
if (removedFromInfoMap) {
builder.append(" Removed from the replicaInfoMap.");
}
if (evictionMapName != null) {
builder.append(" Removed from ").append(evictionMapName);
}
LOG.trace(builder.toString());
}
unref(replica);
}
/**
* Fetch or create a replica.
*
* You must hold the cache lock while calling this function.
*
* @param key Key to use for lookup.
* @param creator Replica creator callback. Will be called without
* the cache lock being held.
*
* @return Null if no replica could be found or created.
* The replica, otherwise.
*/
public ShortCircuitReplicaInfo fetchOrCreate(ExtendedBlockId key,
ShortCircuitReplicaCreator creator) {
Waitable<ShortCircuitReplicaInfo> newWaitable = null;
lock.lock();
try {
ShortCircuitReplicaInfo info = null;
do {
if (closed) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": can't fetchOrCreate " + key +
" because the cache is closed.");
}
return null;
}
Waitable<ShortCircuitReplicaInfo> waitable = replicaInfoMap.get(key);
if (waitable != null) {
try {
info = fetch(key, waitable);
} catch (RetriableException e) {
if (LOG.isDebugEnabled()) {
LOG.debug(this + ": retrying " + e.getMessage());
}
continue;
}
}
} while (false);
if (info != null) return info;
// We need to load the replica ourselves.
newWaitable = new Waitable<ShortCircuitReplicaInfo>(lock.newCondition());
replicaInfoMap.put(key, newWaitable);
} finally {
lock.unlock();
}
return create(key, creator, newWaitable);
}
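// Usage sketch (editor's addition; the creator body and the dataIn/metaIn
// streams are hypothetical): a reader typically calls fetchOrCreate with a
// callback that opens the block and metadata files, for example:
//
//   ShortCircuitReplicaInfo info = cache.fetchOrCreate(key,
//       new ShortCircuitReplicaCreator() {
//         @Override
//         public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
//           try {
//             return new ShortCircuitReplicaInfo(new ShortCircuitReplica(
//                 key, dataIn, metaIn, cache, Time.monotonicNow()));
//           } catch (IOException e) {
//             return new ShortCircuitReplicaInfo();  // generic failure
//           }
//         }
//       });
//
// Only the thread that installs the new Waitable runs the creator; concurrent
// callers for the same key wait in fetch() until the result is provided.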
/**
* Fetch an existing ReplicaInfo object.
*
* @param key The key that we're using.
* @param waitable The waitable object to wait on.
* @return The existing ReplicaInfo object, or null if there is
* none.
*
* @throws RetriableException If the caller needs to retry.
*/
private ShortCircuitReplicaInfo fetch(ExtendedBlockId key,
Waitable<ShortCircuitReplicaInfo> waitable) throws RetriableException {
// Another thread is already in the process of loading this
// ShortCircuitReplica. So we simply wait for it to complete.
ShortCircuitReplicaInfo info;
try {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": found waitable for " + key);
}
info = waitable.await();
} catch (InterruptedException e) {
LOG.info(this + ": interrupted while waiting for " + key);
Thread.currentThread().interrupt();
throw new RetriableException("interrupted");
}
if (info.getInvalidTokenException() != null) {
LOG.warn(this + ": could not get " + key + " due to InvalidToken " +
"exception.", info.getInvalidTokenException());
return info;
}
ShortCircuitReplica replica = info.getReplica();
if (replica == null) {
LOG.warn(this + ": failed to get " + key);
return info;
}
if (replica.purged) {
// Ignore replicas that have already been purged from the cache.
throw new RetriableException("Ignoring purged replica " +
replica + ". Retrying.");
}
// Check if the replica is stale before using it.
// If it is, purge it and retry.
if (replica.isStale()) {
LOG.info(this + ": got stale replica " + replica + ". Removing " +
"this replica from the replicaInfoMap and retrying.");
// Remove the cache's reference to the replica. This may or may not
// trigger a close.
purge(replica);
throw new RetriableException("ignoring stale replica " + replica);
}
ref(replica);
return info;
}
private ShortCircuitReplicaInfo create(ExtendedBlockId key,
ShortCircuitReplicaCreator creator,
Waitable<ShortCircuitReplicaInfo> newWaitable) {
// Handle loading a new replica.
ShortCircuitReplicaInfo info = null;
try {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": loading " + key);
}
info = creator.createShortCircuitReplicaInfo();
} catch (RuntimeException e) {
LOG.warn(this + ": failed to load " + key, e);
}
if (info == null) info = new ShortCircuitReplicaInfo();
lock.lock();
try {
if (info.getReplica() != null) {
// On success, make sure the cache cleaner thread is running.
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": successfully loaded " + info.getReplica());
}
startCacheCleanerThreadIfNeeded();
// Note: new ShortCircuitReplicas start with a refCount of 2,
// indicating that both this cache and whoever requested the
// creation of the replica hold a reference. So we don't need
// to increment the reference count here.
} else {
// On failure, remove the waitable from the replicaInfoMap.
Waitable<ShortCircuitReplicaInfo> waitableInMap = replicaInfoMap.get(key);
if (waitableInMap == newWaitable) replicaInfoMap.remove(key);
if (info.getInvalidTokenException() != null) {
LOG.warn(this + ": could not load " + key + " due to InvalidToken " +
"exception.", info.getInvalidTokenException());
} else {
LOG.warn(this + ": failed to load " + key);
}
}
newWaitable.provide(info);
} finally {
lock.unlock();
}
return info;
}
private void startCacheCleanerThreadIfNeeded() {
if (cacheCleaner == null) {
cacheCleaner = new CacheCleaner();
long rateMs = cacheCleaner.getRateInMs();
ScheduledFuture<?> future =
executor.scheduleAtFixedRate(cacheCleaner, rateMs, rateMs,
TimeUnit.MILLISECONDS);
cacheCleaner.setFuture(future);
if (LOG.isDebugEnabled()) {
LOG.debug(this + ": starting cache cleaner thread which will run " +
"every " + rateMs + " ms");
}
}
}
ClientMmap getOrCreateClientMmap(ShortCircuitReplica replica) {
Condition newCond;
lock.lock();
try {
while (replica.mmapData != null) {
if (replica.mmapData instanceof ClientMmap) {
ref(replica);
ClientMmap clientMmap = (ClientMmap)replica.mmapData;
clientMmap.ref();
return clientMmap;
} else if (replica.mmapData instanceof Long) {
long lastAttemptTimeMs = (Long)replica.mmapData;
long delta = Time.monotonicNow() - lastAttemptTimeMs;
if (delta < staleThresholdMs) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": can't create client mmap for " +
replica + " because we failed to " +
"create one just " + delta + "ms ago.");
}
return null;
}
if (LOG.isTraceEnabled()) {
LOG.trace(this + ": retrying client mmap for " + replica +
", " + delta + " ms after the previous failure.");
}
} else if (replica.mmapData instanceof Condition) {
Condition cond = (Condition)replica.mmapData;
cond.awaitUninterruptibly();
} else {
Preconditions.checkState(false, "invalid mmapData type " +
replica.mmapData.getClass().getName());
}
}
newCond = lock.newCondition();
replica.mmapData = newCond;
} finally {
lock.unlock();
}
MappedByteBuffer map = replica.loadMmapInternal();
lock.lock();
try {
if (map == null) {
replica.mmapData = Long.valueOf(Time.monotonicNow());
newCond.signalAll();
return null;
} else {
ClientMmap clientMmap = new ClientMmap(replica, map);
outstandingMmapCount++;
replica.mmapData = clientMmap;
ref(replica);
newCond.signalAll();
return clientMmap;
}
} finally {
lock.unlock();
}
}
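// Editor's summary (not in the original source): replica.mmapData acts as a
// small state machine -- null before any attempt, a Condition while one thread
// is performing the mmap, a ClientMmap once mapping succeeds, and a Long holding
// the failure time in monotonic milliseconds after a failed attempt, which the
// code above uses to suppress retries until staleThresholdMs has elapsed.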
/**
* Close the cache and free all associated resources.
*/
public void close() {
try {
lock.lock();
if (closed) return;
closed = true;
LOG.info(this + ": closing");
maxNonMmappedEvictableLifespanMs = 0;
maxEvictableMmapedSize = 0;
// Close and join cacheCleaner thread.
IOUtils.cleanup(LOG, cacheCleaner);
// Purge all replicas.
while (true) {
Entry<Long, ShortCircuitReplica> entry = evictable.firstEntry();
if (entry == null) break;
purge(entry.getValue());
}
while (true) {
Entry<Long, ShortCircuitReplica> entry = evictableMmapped.firstEntry();
if (entry == null) break;
purge(entry.getValue());
}
} finally {
lock.unlock();
}
}
@VisibleForTesting // ONLY for testing
public interface CacheVisitor {
void visit(int numOutstandingMmaps,
Map<ExtendedBlockId, ShortCircuitReplica> replicas,
Map<ExtendedBlockId, InvalidToken> failedLoads,
Map<Long, ShortCircuitReplica> evictable,
Map<Long, ShortCircuitReplica> evictableMmapped);
}
@VisibleForTesting // ONLY for testing
public void accept(CacheVisitor visitor) {
lock.lock();
try {
Map<ExtendedBlockId, ShortCircuitReplica> replicas =
new HashMap<ExtendedBlockId, ShortCircuitReplica>();
Map<ExtendedBlockId, InvalidToken> failedLoads =
new HashMap<ExtendedBlockId, InvalidToken>();
for (Entry<ExtendedBlockId, Waitable<ShortCircuitReplicaInfo>> entry :
replicaInfoMap.entrySet()) {
Waitable<ShortCircuitReplicaInfo> waitable = entry.getValue();
if (waitable.hasVal()) {
if (waitable.getVal().getReplica() != null) {
replicas.put(entry.getKey(), waitable.getVal().getReplica());
} else {
// The exception may be null here, indicating a failed load that
// isn't the result of an invalid block token.
failedLoads.put(entry.getKey(),
waitable.getVal().getInvalidTokenException());
}
}
}
if (LOG.isDebugEnabled()) {
StringBuilder builder = new StringBuilder();
builder.append("visiting ").append(visitor.getClass().getName()).
append("with outstandingMmapCount=").append(outstandingMmapCount).
append(", replicas=");
String prefix = "";
for (Entry<ExtendedBlockId, ShortCircuitReplica> entry : replicas.entrySet()) {
builder.append(prefix).append(entry.getValue());
prefix = ",";
}
prefix = "";
builder.append(", failedLoads=");
for (Entry<ExtendedBlockId, InvalidToken> entry : failedLoads.entrySet()) {
builder.append(prefix).append(entry.getValue());
prefix = ",";
}
prefix = "";
builder.append(", evictable=");
for (Entry<Long, ShortCircuitReplica> entry : evictable.entrySet()) {
builder.append(prefix).append(entry.getKey()).
append(":").append(entry.getValue());
prefix = ",";
}
prefix = "";
builder.append(", evictableMmapped=");
for (Entry<Long, ShortCircuitReplica> entry : evictableMmapped.entrySet()) {
builder.append(prefix).append(entry.getKey()).
append(":").append(entry.getValue());
prefix = ",";
}
LOG.debug(builder.toString());
}
visitor.visit(outstandingMmapCount, replicas, failedLoads,
evictable, evictableMmapped);
} finally {
lock.unlock();
}
}
@Override
public String toString() {
return "ShortCircuitCache(0x" +
Integer.toHexString(System.identityHashCode(this)) + ")";
}
}
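The accept() hook above is the intended seam for tests to assert on the cache's internal state. A minimal sketch (editor's addition; assumes a ShortCircuitCache instance named cache and a JUnit-style Assert, neither of which appears in this file):

cache.accept(new ShortCircuitCache.CacheVisitor() {
  @Override
  public void visit(int numOutstandingMmaps,
      Map<ExtendedBlockId, ShortCircuitReplica> replicas,
      Map<ExtendedBlockId, InvalidToken> failedLoads,
      Map<Long, ShortCircuitReplica> evictable,
      Map<Long, ShortCircuitReplica> evictableMmapped) {
    // for example: after all readers close, nothing should still be mmapped
    Assert.assertEquals(0, numOutstandingMmaps);
    Assert.assertTrue(evictableMmapped.isEmpty());
  }
});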

View File

@ -0,0 +1,268 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.client;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.util.Time;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
/**
* A ShortCircuitReplica object contains file descriptors for a block that
* we are reading via short-circuit local reads.
*
* The file descriptors can be shared between multiple threads because
* all the operations we perform are stateless-- i.e., we use pread
* instead of read, to avoid using the shared position state.
*/
@InterfaceAudience.Private
public class ShortCircuitReplica {
public static final Log LOG = LogFactory.getLog(ShortCircuitCache.class);
/**
* Identifies this ShortCircuitReplica object.
*/
final ExtendedBlockId key;
/**
* The block data input stream.
*/
private final FileInputStream dataStream;
/**
* The block metadata input stream.
*
* TODO: make this nullable if the file has no checksums on disk.
*/
private final FileInputStream metaStream;
/**
* Block metadata header.
*/
private final BlockMetadataHeader metaHeader;
/**
* The cache we belong to.
*/
private final ShortCircuitCache cache;
/**
* Monotonic time at which the replica was created.
*/
private final long creationTimeMs;
/**
* Current mmap state.
*
* Protected by the cache lock.
*/
Object mmapData;
/**
* True if this replica has been purged from the cache; false otherwise.
*
* Protected by the cache lock.
*/
boolean purged = false;
/**
* Number of external references to this replica. Replicas are referenced
* by the cache, BlockReaderLocal instances, and by ClientMmap instances.
* The number starts at 2 because when we create a replica, it is referenced
* by both the cache and the requester.
*
* Protected by the cache lock.
*/
int refCount = 2;
/**
* The monotonic time in nanoseconds at which the replica became evictable, or
* null if it is not evictable.
*
* Protected by the cache lock.
*/
private Long evictableTimeNs = null;
public ShortCircuitReplica(ExtendedBlockId key,
FileInputStream dataStream, FileInputStream metaStream,
ShortCircuitCache cache, long creationTimeMs) throws IOException {
this.key = key;
this.dataStream = dataStream;
this.metaStream = metaStream;
this.metaHeader =
BlockMetadataHeader.preadHeader(metaStream.getChannel());
if (metaHeader.getVersion() != 1) {
throw new IOException("invalid metadata header version " +
metaHeader.getVersion() + ". Can only handle version 1.");
}
this.cache = cache;
this.creationTimeMs = creationTimeMs;
}
/**
* Decrement the reference count.
*/
public void unref() {
cache.unref(this);
}
/**
* Check if the replica is stale.
*
* Must be called with the cache lock held.
*/
boolean isStale() {
long deltaMs = Time.monotonicNow() - creationTimeMs;
long staleThresholdMs = cache.getStaleThresholdMs();
if (deltaMs > staleThresholdMs) {
if (LOG.isTraceEnabled()) {
LOG.trace(this + " is stale because it's " + deltaMs +
" ms old, and staleThresholdMs = " + staleThresholdMs);
}
return true;
} else {
if (LOG.isTraceEnabled()) {
LOG.trace(this + " is not stale because it's only " + deltaMs +
" ms old, and staleThresholdMs = " + staleThresholdMs);
}
return false;
}
}
/**
* Check if the replica has an associated mmap that has been fully loaded.
*
* Must be called with the cache lock held.
*/
@VisibleForTesting
public boolean hasMmap() {
return ((mmapData != null) && (mmapData instanceof ClientMmap));
}
/**
* Free the mmap associated with this replica.
*
* Must be called with the cache lock held.
*/
void munmap() {
ClientMmap clientMmap = (ClientMmap)mmapData;
NativeIO.POSIX.munmap(clientMmap.getMappedByteBuffer());
mmapData = null;
}
/**
* Close the replica.
*
* Must be called after there are no more references to the replica in the
* cache or elsewhere.
*/
void close() {
Preconditions.checkState(refCount == 0,
"tried to close replica with refCount " + refCount + ": " + this);
Preconditions.checkState(purged,
"tried to close unpurged replica " + this);
if (hasMmap()) munmap();
IOUtils.cleanup(LOG, dataStream, metaStream);
}
public FileInputStream getDataStream() {
return dataStream;
}
public FileInputStream getMetaStream() {
return metaStream;
}
public BlockMetadataHeader getMetaHeader() {
return metaHeader;
}
public ExtendedBlockId getKey() {
return key;
}
public ClientMmap getOrCreateClientMmap() {
return cache.getOrCreateClientMmap(this);
}
MappedByteBuffer loadMmapInternal() {
try {
FileChannel channel = dataStream.getChannel();
return channel.map(MapMode.READ_ONLY, 0, channel.size());
} catch (IOException e) {
LOG.warn(this + ": mmap error", e);
return null;
} catch (RuntimeException e) {
LOG.warn(this + ": mmap error", e);
return null;
}
}
/**
* Get the evictable time in nanoseconds.
*
* Note: you must hold the cache lock to call this function.
*
* @return the evictable time in nanoseconds.
*/
public Long getEvictableTimeNs() {
return evictableTimeNs;
}
/**
* Set the evictable time in nanoseconds.
*
* Note: you must hold the cache lock to call this function.
*
* @param evictableTimeNs The evictable time in nanoseconds, or null
* to set no evictable time.
*/
void setEvictableTimeNs(Long evictableTimeNs) {
this.evictableTimeNs = evictableTimeNs;
}
/**
* Convert the replica to a string for debugging purposes.
* Note that we can't take the lock here.
*/
@Override
public String toString() {
return new StringBuilder().append("ShortCircuitReplica{").
append("key=").append(key).
append(", metaHeader.version=").append(metaHeader.getVersion()).
append(", metaHeader.checksum=").append(metaHeader.getChecksum()).
append(", ident=").append("0x").
append(Integer.toHexString(System.identityHashCode(this))).
append(", creationTimeMs=").append(creationTimeMs).
append("}").toString();
}
}
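Because a newly created replica starts with refCount == 2 (one reference held by the cache and one by the requester), the caller owes exactly one unref() when it is finished. A minimal sketch (editor's addition; assumes info was returned by ShortCircuitCache.fetchOrCreate and actually carries a replica):

ShortCircuitReplica replica = info.getReplica();
try {
  FileInputStream dataIn = replica.getDataStream();
  // issue position-independent reads (pread) against dataIn.getChannel() ...
} finally {
  replica.unref();  // drop the requester's reference; the cache keeps its own
}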

View File

@ -0,0 +1,64 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.client;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
public final class ShortCircuitReplicaInfo {
private final ShortCircuitReplica replica;
private final InvalidToken exc;
public ShortCircuitReplicaInfo() {
this.replica = null;
this.exc = null;
}
public ShortCircuitReplicaInfo(ShortCircuitReplica replica) {
this.replica = replica;
this.exc = null;
}
public ShortCircuitReplicaInfo(InvalidToken exc) {
this.replica = null;
this.exc = exc;
}
public ShortCircuitReplica getReplica() {
return replica;
}
public InvalidToken getInvalidTokenException() {
return exc;
}
public String toString() {
StringBuilder builder = new StringBuilder();
String prefix = "";
builder.append("ShortCircuitReplicaInfo{");
if (replica != null) {
builder.append(prefix).append(replica);
prefix = ", ";
}
if (exc != null) {
builder.append(prefix).append(exc);
prefix = ", ";
}
builder.append("}");
return builder.toString();
}
}
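ShortCircuitReplicaInfo is effectively a three-way result: a replica on success, an InvalidToken when the DataNode rejects the block token, or neither for a generic failure. A hedged sketch of how a creator might build one (openReplica is a hypothetical helper, not part of this commit; note InvalidToken must be caught before the broader IOException):

ShortCircuitReplicaInfo result;
try {
  result = new ShortCircuitReplicaInfo(openReplica(key));  // success
} catch (InvalidToken e) {
  result = new ShortCircuitReplicaInfo(e);                 // bad block token
} catch (IOException e) {
  result = new ShortCircuitReplicaInfo();                  // generic failure
}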

View File

@ -102,9 +102,10 @@ public DatanodeProtocolClientSideTranslatorPB(InetSocketAddress nameNodeAddr,
private static DatanodeProtocolPB createNamenode(
InetSocketAddress nameNodeAddr, Configuration conf,
UserGroupInformation ugi) throws IOException {
return RPC.getProxy(DatanodeProtocolPB.class,
return RPC.getProtocolProxy(DatanodeProtocolPB.class,
RPC.getProtocolVersion(DatanodeProtocolPB.class), nameNodeAddr, ugi,
conf, NetUtils.getSocketFactory(conf, DatanodeProtocolPB.class));
conf, NetUtils.getSocketFactory(conf, DatanodeProtocolPB.class),
org.apache.hadoop.ipc.Client.getPingInterval(conf), null).getProxy();
}
/** Create a {@link NameNode} proxy */

View File

@ -18,7 +18,26 @@
package org.apache.hadoop.hdfs.server.common;
import com.google.common.base.Charsets;
import static org.apache.hadoop.fs.CommonConfigurationKeys.DEFAULT_HADOOP_HTTP_STATIC_USER;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_HTTP_STATIC_USER;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.net.URLEncoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.jsp.JspWriter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -27,10 +46,17 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.RemotePeerFactory;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.*;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
@ -53,22 +79,7 @@
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.VersionInfo;
import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.jsp.JspWriter;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.net.URLEncoder;
import java.util.*;
import static org.apache.hadoop.fs.CommonConfigurationKeys.DEFAULT_HADOOP_HTTP_STATIC_USER;
import static org.apache.hadoop.fs.CommonConfigurationKeys.HADOOP_HTTP_STATIC_USER;
import com.google.common.base.Charsets;
@InterfaceAudience.Private
public class JspHelper {
@ -168,101 +179,97 @@ public static DatanodeInfo bestNode(LocatedBlocks blks, Configuration conf)
}
NodeRecord[] nodes = map.values().toArray(new NodeRecord[map.size()]);
Arrays.sort(nodes, new NodeRecordComparator());
return bestNode(nodes, false, conf);
return bestNode(nodes, false);
}
public static DatanodeInfo bestNode(LocatedBlock blk, Configuration conf)
throws IOException {
DatanodeInfo[] nodes = blk.getLocations();
return bestNode(nodes, true, conf);
return bestNode(nodes, true);
}
public static DatanodeInfo bestNode(DatanodeInfo[] nodes, boolean doRandom,
Configuration conf) throws IOException {
TreeSet<DatanodeInfo> deadNodes = new TreeSet<DatanodeInfo>();
DatanodeInfo chosenNode = null;
int failures = 0;
Socket s = null;
int index = -1;
private static DatanodeInfo bestNode(DatanodeInfo[] nodes, boolean doRandom)
throws IOException {
if (nodes == null || nodes.length == 0) {
throw new IOException("No nodes contain this block");
}
while (s == null) {
if (chosenNode == null) {
do {
if (doRandom) {
index = DFSUtil.getRandom().nextInt(nodes.length);
} else {
index++;
}
chosenNode = nodes[index];
} while (deadNodes.contains(chosenNode));
}
chosenNode = nodes[index];
//just ping to check whether the node is alive
InetSocketAddress targetAddr = NetUtils.createSocketAddr(
chosenNode.getInfoAddr());
try {
s = NetUtils.getDefaultSocketFactory(conf).createSocket();
s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
} catch (IOException e) {
deadNodes.add(chosenNode);
IOUtils.closeSocket(s);
s = null;
failures++;
}
if (failures == nodes.length)
throw new IOException("Could not reach the block containing the data. Please try again");
int l = 0;
while (l < nodes.length && !nodes[l].isDecommissioned()) {
++l;
}
s.close();
return chosenNode;
if (l == 0) {
throw new IOException("No active nodes contain this block");
}
int index = doRandom ? DFSUtil.getRandom().nextInt(l) : 0;
return nodes[index];
}
public static void streamBlockInAscii(InetSocketAddress addr, String poolId,
long blockId, Token<BlockTokenIdentifier> blockToken, long genStamp,
long blockSize, long offsetIntoBlock, long chunkSizeToView,
JspWriter out, Configuration conf, DFSClient.Conf dfsConf,
DataEncryptionKey encryptionKey)
JspWriter out, final Configuration conf, DFSClient.Conf dfsConf,
final DataEncryptionKey encryptionKey)
throws IOException {
if (chunkSizeToView == 0) return;
Socket s = NetUtils.getDefaultSocketFactory(conf).createSocket();
s.connect(addr, HdfsServerConstants.READ_TIMEOUT);
s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
int amtToRead = (int)Math.min(chunkSizeToView, blockSize - offsetIntoBlock);
// Use the block name for file name.
String file = BlockReaderFactory.getFileName(addr, poolId, blockId);
BlockReader blockReader = BlockReaderFactory.newBlockReader(dfsConf, file,
new ExtendedBlock(poolId, blockId, 0, genStamp), blockToken,
offsetIntoBlock, amtToRead, true,
"JspHelper", TcpPeerServer.peerFromSocketAndKey(s, encryptionKey),
new DatanodeID(addr.getAddress().getHostAddress(),
addr.getHostName(), poolId, addr.getPort(), 0, 0, 0), null,
null, null, false, CachingStrategy.newDefaultStrategy());
BlockReader blockReader = new BlockReaderFactory(dfsConf).
setInetSocketAddress(addr).
setBlock(new ExtendedBlock(poolId, blockId, 0, genStamp)).
setFileName(BlockReaderFactory.getFileName(addr, poolId, blockId)).
setBlockToken(blockToken).
setStartOffset(offsetIntoBlock).
setLength(amtToRead).
setVerifyChecksum(true).
setClientName("JspHelper").
setClientCacheContext(ClientContext.getFromConf(conf)).
setDatanodeInfo(new DatanodeInfo(
new DatanodeID(addr.getAddress().getHostAddress(),
addr.getHostName(), poolId, addr.getPort(), 0, 0, 0))).
setCachingStrategy(CachingStrategy.newDefaultStrategy()).
setConfiguration(conf).
setRemotePeerFactory(new RemotePeerFactory() {
@Override
public Peer newConnectedPeer(InetSocketAddress addr)
throws IOException {
Peer peer = null;
Socket sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
try {
sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
peer = TcpPeerServer.peerFromSocketAndKey(sock, encryptionKey);
} finally {
if (peer == null) {
IOUtils.closeSocket(sock);
}
}
return peer;
}
}).
build();
final byte[] buf = new byte[amtToRead];
int readOffset = 0;
int retries = 2;
while ( amtToRead > 0 ) {
int numRead = amtToRead;
try {
blockReader.readFully(buf, readOffset, amtToRead);
try {
int readOffset = 0;
int retries = 2;
while (amtToRead > 0) {
int numRead = amtToRead;
try {
blockReader.readFully(buf, readOffset, amtToRead);
} catch (IOException e) {
retries--;
if (retries == 0)
throw new IOException("Could not read data from datanode");
continue;
}
amtToRead -= numRead;
readOffset += numRead;
}
catch (IOException e) {
retries--;
if (retries == 0)
throw new IOException("Could not read data from datanode");
continue;
}
amtToRead -= numRead;
readOffset += numRead;
} finally {
blockReader.close();
}
blockReader.close();
out.print(HtmlQuoting.quoteHtmlChars(new String(buf, Charsets.UTF_8)));
}

View File

@ -34,6 +34,8 @@
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import com.google.common.annotations.VisibleForTesting;
/**
@ -55,7 +57,8 @@ public class BlockMetadataHeader {
private short version;
private DataChecksum checksum = null;
BlockMetadataHeader(short version, DataChecksum checksum) {
@VisibleForTesting
public BlockMetadataHeader(short version, DataChecksum checksum) {
this.checksum = checksum;
this.version = version;
}
@ -148,7 +151,8 @@ private static BlockMetadataHeader readHeader(short version, DataInputStream in)
* @return
* @throws IOException
*/
private static void writeHeader(DataOutputStream out,
@VisibleForTesting
public static void writeHeader(DataOutputStream out,
BlockMetadataHeader header)
throws IOException {
out.writeShort(header.getVersion());

View File

@ -37,12 +37,12 @@
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.builder.HashCodeBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@ -56,43 +56,6 @@
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class FsDatasetCache {
/**
* Keys which identify MappableBlocks.
*/
private static final class Key {
/**
* Block id.
*/
final long id;
/**
* Block pool id.
*/
final String bpid;
Key(long id, String bpid) {
this.id = id;
this.bpid = bpid;
}
@Override
public boolean equals(Object o) {
if (o == null) {
return false;
}
if (!(o.getClass() == getClass())) {
return false;
}
Key other = (Key)o;
return ((other.id == this.id) && (other.bpid.equals(this.bpid)));
}
@Override
public int hashCode() {
return new HashCodeBuilder().append(id).append(bpid).hashCode();
}
};
/**
* MappableBlocks that we know about.
*/
@ -143,7 +106,8 @@ public boolean shouldAdvertise() {
/**
* Stores MappableBlock objects and the states they're in.
*/
private final HashMap<Key, Value> mappableBlockMap = new HashMap<Key, Value>();
private final HashMap<ExtendedBlockId, Value> mappableBlockMap =
new HashMap<ExtendedBlockId, Value>();
private final AtomicLong numBlocksCached = new AtomicLong(0);
@ -260,12 +224,12 @@ public FsDatasetCache(FsDatasetImpl dataset) {
*/
synchronized List<Long> getCachedBlocks(String bpid) {
List<Long> blocks = new ArrayList<Long>();
for (Iterator<Entry<Key, Value>> iter =
for (Iterator<Entry<ExtendedBlockId, Value>> iter =
mappableBlockMap.entrySet().iterator(); iter.hasNext(); ) {
Entry<Key, Value> entry = iter.next();
if (entry.getKey().bpid.equals(bpid)) {
Entry<ExtendedBlockId, Value> entry = iter.next();
if (entry.getKey().getBlockPoolId().equals(bpid)) {
if (entry.getValue().state.shouldAdvertise()) {
blocks.add(entry.getKey().id);
blocks.add(entry.getKey().getBlockId());
}
}
}
@ -278,7 +242,7 @@ synchronized List<Long> getCachedBlocks(String bpid) {
synchronized void cacheBlock(long blockId, String bpid,
String blockFileName, long length, long genstamp,
Executor volumeExecutor) {
Key key = new Key(blockId, bpid);
ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
Value prevValue = mappableBlockMap.get(key);
if (prevValue != null) {
if (LOG.isDebugEnabled()) {
@ -299,7 +263,7 @@ synchronized void cacheBlock(long blockId, String bpid,
}
synchronized void uncacheBlock(String bpid, long blockId) {
Key key = new Key(blockId, bpid);
ExtendedBlockId key = new ExtendedBlockId(blockId, bpid);
Value prevValue = mappableBlockMap.get(key);
if (prevValue == null) {
@ -344,12 +308,12 @@ synchronized void uncacheBlock(String bpid, long blockId) {
* Background worker that mmaps, mlocks, and checksums a block
*/
private class CachingTask implements Runnable {
private final Key key;
private final ExtendedBlockId key;
private final String blockFileName;
private final long length;
private final long genstamp;
CachingTask(Key key, String blockFileName, long length, long genstamp) {
CachingTask(ExtendedBlockId key, String blockFileName, long length, long genstamp) {
this.key = key;
this.blockFileName = blockFileName;
this.length = length;
@ -361,13 +325,13 @@ public void run() {
boolean success = false;
FileInputStream blockIn = null, metaIn = null;
MappableBlock mappableBlock = null;
ExtendedBlock extBlk =
new ExtendedBlock(key.bpid, key.id, length, genstamp);
ExtendedBlock extBlk = new ExtendedBlock(key.getBlockPoolId(),
key.getBlockId(), length, genstamp);
long newUsedBytes = usedBytesCount.reserve(length);
if (newUsedBytes < 0) {
LOG.warn("Failed to cache block id " + key.id + ", pool " + key.bpid +
": could not reserve " + length + " more bytes in the " +
"cache: " + DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
LOG.warn("Failed to cache " + key + ": could not reserve " + length +
" more bytes in the cache: " +
DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY +
" of " + maxBytes + " exceeded.");
numBlocksFailedToCache.incrementAndGet();
return;
@ -378,16 +342,15 @@ public void run() {
metaIn = (FileInputStream)dataset.getMetaDataInputStream(extBlk)
.getWrappedStream();
} catch (ClassCastException e) {
LOG.warn("Failed to cache block with id " + key.id + ", pool " +
key.bpid + ": Underlying blocks are not backed by files.", e);
LOG.warn("Failed to cache " + key +
": Underlying blocks are not backed by files.", e);
return;
} catch (FileNotFoundException e) {
LOG.info("Failed to cache block with id " + key.id + ", pool " +
key.bpid + ": failed to find backing files.");
LOG.info("Failed to cache " + key + ": failed to find backing " +
"files.");
return;
} catch (IOException e) {
LOG.warn("Failed to cache block with id " + key.id + ", pool " +
key.bpid + ": failed to open file", e);
LOG.warn("Failed to cache " + key + ": failed to open file", e);
return;
}
try {
@ -395,11 +358,10 @@ public void run() {
load(length, blockIn, metaIn, blockFileName);
} catch (ChecksumException e) {
// Exception message is bogus since this wasn't caused by a file read
LOG.warn("Failed to cache block " + key.id + " in " + key.bpid + ": " +
"checksum verification failed.");
LOG.warn("Failed to cache " + key + ": checksum verification failed.");
return;
} catch (IOException e) {
LOG.warn("Failed to cache block " + key.id + " in " + key.bpid, e);
LOG.warn("Failed to cache " + key, e);
return;
}
synchronized (FsDatasetCache.this) {
@ -409,15 +371,14 @@ public void run() {
value.state == State.CACHING_CANCELLED);
if (value.state == State.CACHING_CANCELLED) {
mappableBlockMap.remove(key);
LOG.warn("Caching of block " + key.id + " in " + key.bpid +
" was cancelled.");
LOG.warn("Caching of " + key + " was cancelled.");
return;
}
mappableBlockMap.put(key, new Value(mappableBlock, State.CACHED));
}
if (LOG.isDebugEnabled()) {
LOG.debug("Successfully cached block " + key.id + " in " + key.bpid +
". We are now caching " + newUsedBytes + " bytes in total.");
LOG.debug("Successfully cached " + key + ". We are now caching " +
newUsedBytes + " bytes in total.");
}
numBlocksCached.addAndGet(1);
success = true;
@ -425,9 +386,8 @@ public void run() {
if (!success) {
newUsedBytes = usedBytesCount.release(length);
if (LOG.isDebugEnabled()) {
LOG.debug("Caching of block " + key.id + " in " +
key.bpid + " was aborted. We are now caching only " +
newUsedBytes + " + bytes in total.");
LOG.debug("Caching of " + key + " was aborted. We are now " +
"caching only " + newUsedBytes + " + bytes in total.");
}
IOUtils.closeQuietly(blockIn);
IOUtils.closeQuietly(metaIn);
@ -445,9 +405,9 @@ public void run() {
}
private class UncachingTask implements Runnable {
private final Key key;
private final ExtendedBlockId key;
UncachingTask(Key key) {
UncachingTask(ExtendedBlockId key) {
this.key = key;
}
@ -470,8 +430,8 @@ public void run() {
usedBytesCount.release(value.mappableBlock.getLength());
numBlocksCached.addAndGet(-1);
if (LOG.isDebugEnabled()) {
LOG.debug("Uncaching of block " + key.id + " in " + key.bpid +
" completed. usedBytes = " + newUsedBytes);
LOG.debug("Uncaching of " + key + " completed. " +
"usedBytes = " + newUsedBytes);
}
}
}

View File

@ -587,6 +587,8 @@ public void toXML(XMLOutputter doc) throws IOException {
toXmlItemBlockWithLink(doc, nn.host, nn.httpAddress, "NameNode");
toXmlItemBlock(doc, "Blockpool Used",
StringUtils.byteDesc(nn.bpUsed));
toXmlItemBlock(doc, "Blockpool Used%",
DFSUtil.percent2String(DFSUtil.getPercentUsed(nn.bpUsed, total)));
toXmlItemBlock(doc, "Files And Directories",
Long.toString(nn.filesAndDirectories));
toXmlItemBlock(doc, "Blocks", Long.toString(nn.blocksCount));

View File

@ -49,9 +49,6 @@
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeDirectorySection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.AclFeatureProto;
import org.apache.hadoop.hdfs.server.namenode.INodeReference.DstReference;
import org.apache.hadoop.hdfs.server.namenode.INodeReference.WithCount;
import org.apache.hadoop.hdfs.server.namenode.INodeReference.WithName;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.util.ReadOnlyList;
@ -107,25 +104,6 @@ public static ImmutableList<AclEntry> loadAclEntries(
return b.build();
}
public static INodeReference loadINodeReference(
INodeSection.INodeReference r, FSDirectory dir) throws IOException {
long referredId = r.getReferredId();
INode referred = dir.getInode(referredId);
WithCount withCount = (WithCount) referred.getParentReference();
if (withCount == null) {
withCount = new INodeReference.WithCount(null, referred);
}
final INodeReference ref;
if (r.hasDstSnapshotId()) { // DstReference
ref = new INodeReference.DstReference(null, withCount,
r.getDstSnapshotId());
} else {
ref = new INodeReference.WithName(null, withCount, r.getName()
.toByteArray(), r.getLastSnapshotId());
}
return ref;
}
public static INodeDirectory loadINodeDirectory(INodeSection.INode n,
LoaderContext state) {
assert n.getType() == INodeSection.INode.Type.DIRECTORY;
@ -169,6 +147,8 @@ public static void updateBlocksMap(INodeFile file, BlockManager bm) {
}
void loadINodeDirectorySection(InputStream in) throws IOException {
final List<INodeReference> refList = parent.getLoaderContext()
.getRefList();
while (true) {
INodeDirectorySection.DirEntry e = INodeDirectorySection.DirEntry
.parseDelimitedFrom(in);
@ -181,20 +161,13 @@ void loadINodeDirectorySection(InputStream in) throws IOException {
INode child = dir.getInode(id);
addToParent(p, child);
}
for (int i = 0; i < e.getNumOfRef(); i++) {
INodeReference ref = loadINodeReference(in);
for (int refId : e.getRefChildrenList()) {
INodeReference ref = refList.get(refId);
addToParent(p, ref);
}
}
}
private INodeReference loadINodeReference(InputStream in)
throws IOException {
INodeSection.INodeReference ref = INodeSection.INodeReference
.parseDelimitedFrom(in);
return loadINodeReference(ref, dir);
}
void loadINodeSection(InputStream in) throws IOException {
INodeSection s = INodeSection.parseDelimitedFrom(in);
fsn.resetLastInodeId(s.getLastInodeId());
@ -379,19 +352,6 @@ public static INodeSection.INodeDirectory.Builder buildINodeDirectory(
return b;
}
public static INodeSection.INodeReference.Builder buildINodeReference(
INodeReference ref) throws IOException {
INodeSection.INodeReference.Builder rb = INodeSection.INodeReference
.newBuilder().setReferredId(ref.getId());
if (ref instanceof WithName) {
rb.setLastSnapshotId(((WithName) ref).getLastSnapshotId()).setName(
ByteString.copyFrom(ref.getLocalNameBytes()));
} else if (ref instanceof DstReference) {
rb.setDstSnapshotId(((DstReference) ref).getDstSnapshotId());
}
return rb;
}
private final FSNamesystem fsn;
private final FileSummary.Builder summary;
private final SaveNamespaceContext context;
@ -407,6 +367,8 @@ public static INodeSection.INodeReference.Builder buildINodeReference(
void serializeINodeDirectorySection(OutputStream out) throws IOException {
Iterator<INodeWithAdditionalFields> iter = fsn.getFSDirectory()
.getINodeMap().getMapIterator();
final ArrayList<INodeReference> refList = parent.getSaverContext()
.getRefList();
int i = 0;
while (iter.hasNext()) {
INodeWithAdditionalFields n = iter.next();
@ -419,21 +381,16 @@ void serializeINodeDirectorySection(OutputStream out) throws IOException {
if (children.size() > 0) {
INodeDirectorySection.DirEntry.Builder b = INodeDirectorySection.
DirEntry.newBuilder().setParent(n.getId());
List<INodeReference> refs = new ArrayList<INodeReference>();
for (INode inode : children) {
if (!inode.isReference()) {
b.addChildren(inode.getId());
} else {
refs.add(inode.asReference());
refList.add(inode.asReference());
b.addRefChildren(refList.size() - 1);
}
}
b.setNumOfRef(refs.size());
INodeDirectorySection.DirEntry e = b.build();
e.writeDelimitedTo(out);
for (INodeReference ref : refs) {
INodeSection.INodeReference.Builder rb = buildINodeReference(ref);
rb.build().writeDelimitedTo(out);
}
}
++i;

View File

@ -75,10 +75,15 @@ public final class FSImageFormatProtobuf {
public static final class LoaderContext {
private String[] stringTable;
private final ArrayList<INodeReference> refList = Lists.newArrayList();
public String[] getStringTable() {
return stringTable;
}
public ArrayList<INodeReference> getRefList() {
return refList;
}
}
public static final class SaverContext {
@ -111,6 +116,7 @@ Set<Entry<E, Integer>> entrySet() {
return map.entrySet();
}
}
private final ArrayList<INodeReference> refList = Lists.newArrayList();
private final DeduplicationMap<String> stringMap = DeduplicationMap
.newMap();
@ -118,6 +124,10 @@ Set<Entry<E, Integer>> entrySet() {
public DeduplicationMap<String> getStringMap() {
return stringMap;
}
public ArrayList<INodeReference> getRefList() {
return refList;
}
}
public static final class Loader implements FSImageFormat.AbstractLoader {
@ -125,7 +135,6 @@ public static final class Loader implements FSImageFormat.AbstractLoader {
private final Configuration conf;
private final FSNamesystem fsn;
private final LoaderContext ctx;
/** The MD5 sum of the loaded file */
private MD5Hash imgDigest;
/** The transaction ID of the last edit represented by the loaded file */
@ -228,6 +237,9 @@ public int compare(FileSummary.Section s1, FileSummary.Section s2) {
inodeLoader.loadINodeSection(in);
}
break;
case INODE_REFRENCE:
snapshotLoader.loadINodeReferenceSection(in);
break;
case INODE_DIR:
inodeLoader.loadINodeDirectorySection(in);
break;
@ -315,9 +327,10 @@ private void loadCacheManagerSection(InputStream in) throws IOException {
}
public static final class Saver {
public static final int CHECK_CANCEL_INTERVAL = 4096;
private final SaveNamespaceContext context;
private final SaverContext saverContext;
private long currentOffset = FSImageUtil.MAGIC_HEADER.length;
private MD5Hash savedDigest;
@ -326,7 +339,6 @@ public static final class Saver {
private OutputStream sectionOutputStream;
private CompressionCodec codec;
private OutputStream underlyingOutputStream;
public static final int CHECK_CANCEL_INTERVAL = 4096;
Saver(SaveNamespaceContext context) {
this.context = context;
@ -402,6 +414,7 @@ private void saveSnapshots(FileSummary.Builder summary) throws IOException {
snapshotSaver.serializeSnapshotSection(sectionOutputStream);
snapshotSaver.serializeSnapshotDiffSection(sectionOutputStream);
snapshotSaver.serializeINodeReferenceSection(sectionOutputStream);
}
private void saveInternal(FileOutputStream fout,
@ -538,6 +551,7 @@ public enum SectionName {
STRING_TABLE("STRING_TABLE"),
EXTENDED_ACL("EXTENDED_ACL"),
INODE("INODE"),
INODE_REFRENCE("INODE_REFRENCE"),
SNAPSHOT("SNAPSHOT"),
INODE_DIR("INODE_DIR"),
FILES_UNDERCONSTRUCTION("FILES_UNDERCONSTRUCTION"),

View File

@ -32,6 +32,7 @@
import java.util.Map;
import java.util.TreeSet;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@ -40,9 +41,12 @@
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.RemotePeerFactory;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.DirectoryListing;
@ -569,11 +573,10 @@ private void copyBlock(DFSClient dfs, LocatedBlock lblock,
int failures = 0;
InetSocketAddress targetAddr = null;
TreeSet<DatanodeInfo> deadNodes = new TreeSet<DatanodeInfo>();
Socket s = null;
BlockReader blockReader = null;
ExtendedBlock block = lblock.getBlock();
while (s == null) {
while (blockReader == null) {
DatanodeInfo chosenNode;
try {
@ -593,34 +596,47 @@ private void copyBlock(DFSClient dfs, LocatedBlock lblock,
continue;
}
try {
s = NetUtils.getDefaultSocketFactory(conf).createSocket();
s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
String file = BlockReaderFactory.getFileName(targetAddr, block.getBlockPoolId(),
block.getBlockId());
blockReader = BlockReaderFactory.newBlockReader(dfs.getConf(),
file, block, lblock.getBlockToken(), 0, -1, true, "fsck",
TcpPeerServer.peerFromSocketAndKey(s, namenode.getRpcServer().
getDataEncryptionKey()), chosenNode, null, null, null,
false, CachingStrategy.newDropBehind());
String file = BlockReaderFactory.getFileName(targetAddr,
block.getBlockPoolId(), block.getBlockId());
blockReader = new BlockReaderFactory(dfs.getConf()).
setFileName(file).
setBlock(block).
setBlockToken(lblock.getBlockToken()).
setStartOffset(0).
setLength(-1).
setVerifyChecksum(true).
setClientName("fsck").
setDatanodeInfo(chosenNode).
setInetSocketAddress(targetAddr).
setCachingStrategy(CachingStrategy.newDropBehind()).
setClientCacheContext(dfs.getClientContext()).
setConfiguration(namenode.conf).
setRemotePeerFactory(new RemotePeerFactory() {
@Override
public Peer newConnectedPeer(InetSocketAddress addr)
throws IOException {
Peer peer = null;
Socket s = NetUtils.getDefaultSocketFactory(conf).createSocket();
try {
s.connect(addr, HdfsServerConstants.READ_TIMEOUT);
s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
peer = TcpPeerServer.peerFromSocketAndKey(s, namenode.getRpcServer().
getDataEncryptionKey());
} finally {
if (peer == null) {
IOUtils.closeQuietly(s);
}
}
return peer;
}
}).
build();
} catch (IOException ex) {
// Put chosen node into dead list, continue
LOG.info("Failed to connect to " + targetAddr + ":" + ex);
deadNodes.add(chosenNode);
if (s != null) {
try {
s.close();
} catch (IOException iex) {
}
}
s = null;
}
}
if (blockReader == null) {
throw new Exception("Could not open data stream for " + lblock.getBlock());
}
byte[] buf = new byte[1024];
int cnt = 0;
boolean success = true;
@ -638,10 +654,11 @@ private void copyBlock(DFSClient dfs, LocatedBlock lblock,
LOG.error("Error reading block", e);
success = false;
} finally {
try {s.close(); } catch (Exception e1) {}
blockReader.close();
}
if (!success)
if (!success) {
throw new Exception("Could not copy block data for " + lblock.getBlock());
}
}
/*

View File

@ -18,12 +18,10 @@
package org.apache.hadoop.hdfs.server.namenode.snapshot;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.loadINodeDirectory;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.loadINodeReference;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.loadPermission;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Loader.updateBlocksMap;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Saver.buildINodeDirectory;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Saver.buildINodeFile;
import static org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode.Saver.buildINodeReference;
import java.io.IOException;
import java.io.InputStream;
@ -43,8 +41,10 @@
import org.apache.hadoop.hdfs.server.namenode.FSImageFormatPBINode;
import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf;
import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.LoaderContext;
import org.apache.hadoop.hdfs.server.namenode.FSImageFormatProtobuf.SectionName;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeReferenceSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection.CreatedListEntry;
@ -57,6 +57,9 @@
import org.apache.hadoop.hdfs.server.namenode.INodeFileAttributes;
import org.apache.hadoop.hdfs.server.namenode.INodeMap;
import org.apache.hadoop.hdfs.server.namenode.INodeReference;
import org.apache.hadoop.hdfs.server.namenode.INodeReference.DstReference;
import org.apache.hadoop.hdfs.server.namenode.INodeReference.WithCount;
import org.apache.hadoop.hdfs.server.namenode.INodeReference.WithName;
import org.apache.hadoop.hdfs.server.namenode.INodeWithAdditionalFields;
import org.apache.hadoop.hdfs.server.namenode.SaveNamespaceContext;
import org.apache.hadoop.hdfs.server.namenode.snapshot.DirectoryWithSnapshotFeature.DirectoryDiff;
@ -78,7 +81,6 @@ public final static class Loader {
private final FSImageFormatProtobuf.Loader parent;
private final Map<Integer, Snapshot> snapshotMap;
public Loader(FSNamesystem fsn, FSImageFormatProtobuf.Loader parent) {
this.fsn = fsn;
this.fsDir = fsn.getFSDirectory();
@ -86,6 +88,43 @@ public Loader(FSNamesystem fsn, FSImageFormatProtobuf.Loader parent) {
this.parent = parent;
}
/**
* The sequence of the ref nodes in refList must be exactly the same as
* the sequence recorded in the fsimage
*/
public void loadINodeReferenceSection(InputStream in) throws IOException {
final List<INodeReference> refList = parent.getLoaderContext()
.getRefList();
while (true) {
INodeReferenceSection.INodeReference e = INodeReferenceSection
.INodeReference.parseDelimitedFrom(in);
if (e == null) {
break;
}
INodeReference ref = loadINodeReference(e);
refList.add(ref);
}
}
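The ordering constraint noted above is what makes the index-based referencing work: the saver appends each INodeReference to a flat list and records only its position, and the loader rebuilds the list in the same order so the recorded indexes resolve to the same entries. A minimal sketch of that scheme, using simplified stand-in types rather than the real INodeReference/DirEntry classes (all names below are illustrative only):

import java.util.ArrayList;
import java.util.List;

class RefListSketch {
  // Stand-in for the saver/loader refList; the order must be identical on both sides.
  private final List<String> refList = new ArrayList<String>();

  // Saver side: append the reference and record its index, which is what
  // would be written into DirEntry.refChildren or DirectoryDiff.deletedINodeRef.
  int save(String ref) {
    refList.add(ref);
    return refList.size() - 1;
  }

  // Loader side: as long as the list is rebuilt in the original order,
  // the saved index resolves to the same reference.
  String load(int refIndex) {
    return refList.get(refIndex);
  }

  public static void main(String[] args) {
    RefListSketch sketch = new RefListSketch();
    int idx = sketch.save("ref->inode-1001");
    System.out.println(sketch.load(idx)); // prints ref->inode-1001
  }
}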
private INodeReference loadINodeReference(
INodeReferenceSection.INodeReference r) throws IOException {
long referredId = r.getReferredId();
INode referred = fsDir.getInode(referredId);
WithCount withCount = (WithCount) referred.getParentReference();
if (withCount == null) {
withCount = new INodeReference.WithCount(null, referred);
}
final INodeReference ref;
if (r.hasDstSnapshotId()) { // DstReference
ref = new INodeReference.DstReference(null, withCount,
r.getDstSnapshotId());
} else {
ref = new INodeReference.WithName(null, withCount, r.getName()
.toByteArray(), r.getLastSnapshotId());
}
return ref;
}
/**
* Load the snapshots section from fsimage. Also convert snapshottable
* directories into {@link INodeDirectorySnapshottable}.
@ -134,6 +173,8 @@ private void loadSnapshots(InputStream in, int size) throws IOException {
* Load the snapshot diff section from fsimage.
*/
public void loadSnapshotDiffSection(InputStream in) throws IOException {
final List<INodeReference> refList = parent.getLoaderContext()
.getRefList();
while (true) {
SnapshotDiffSection.DiffEntry entry = SnapshotDiffSection.DiffEntry
.parseDelimitedFrom(in);
@ -148,7 +189,8 @@ public void loadSnapshotDiffSection(InputStream in) throws IOException {
loadFileDiffList(in, inode.asFile(), entry.getNumOfDiff());
break;
case DIRECTORYDIFF:
loadDirectoryDiffList(in, inode.asDirectory(), entry.getNumOfDiff());
loadDirectoryDiffList(in, inode.asDirectory(), entry.getNumOfDiff(),
refList);
break;
}
}
@ -209,13 +251,13 @@ private void addToDeletedList(INode dnode, INodeDirectory parent) {
/**
* Load the deleted list in a DirectoryDiff
* @param totalSize the total size of the deleted list
* @param deletedNodes non-reference inodes in the deleted list. These
* inodes' ids are directly recorded in protobuf
*/
private List<INode> loadDeletedList(InputStream in, INodeDirectory dir,
int refNum, List<Long> deletedNodes) throws IOException {
List<INode> dlist = new ArrayList<INode>(refNum + deletedNodes.size());
private List<INode> loadDeletedList(final List<INodeReference> refList,
InputStream in, INodeDirectory dir, List<Long> deletedNodes,
List<Integer> deletedRefNodes)
throws IOException {
List<INode> dlist = new ArrayList<INode>(deletedRefNodes.size()
+ deletedNodes.size());
// load non-reference inodes
for (long deletedId : deletedNodes) {
INode deleted = fsDir.getInode(deletedId);
@ -223,13 +265,12 @@ private List<INode> loadDeletedList(InputStream in, INodeDirectory dir,
addToDeletedList(deleted, dir);
}
// load reference nodes in the deleted list
for (int r = 0; r < refNum; r++) {
INodeSection.INodeReference ref = INodeSection.INodeReference
.parseDelimitedFrom(in);
INodeReference refNode = loadINodeReference(ref, fsDir);
dlist.add(refNode);
addToDeletedList(refNode, dir);
for (int refId : deletedRefNodes) {
INodeReference deletedRef = refList.get(refId);
dlist.add(deletedRef);
addToDeletedList(deletedRef, dir);
}
Collections.sort(dlist, new Comparator<INode>() {
@Override
public int compare(INode n1, INode n2) {
@ -241,7 +282,7 @@ public int compare(INode n1, INode n2) {
/** Load DirectoryDiff list for a directory with snapshot feature */
private void loadDirectoryDiffList(InputStream in, INodeDirectory dir,
int size) throws IOException {
int size, final List<INodeReference> refList) throws IOException {
if (!dir.isWithSnapshot()) {
dir.addSnapshotFeature(null);
}
@ -284,8 +325,8 @@ private void loadDirectoryDiffList(InputStream in, INodeDirectory dir,
List<INode> clist = loadCreatedList(in, dir,
diffInPb.getCreatedListSize());
// load deleted list
List<INode> dlist = loadDeletedList(in, dir,
diffInPb.getNumOfDeletedRef(), diffInPb.getDeletedINodeList());
List<INode> dlist = loadDeletedList(refList, in, dir,
diffInPb.getDeletedINodeList(), diffInPb.getDeletedINodeRefList());
// create the directory diff
DirectoryDiff diff = new DirectoryDiff(snapshotId, copy, null,
childrenSize, clist, dlist, useRoot);
@ -304,7 +345,8 @@ public final static class Saver {
private final SaveNamespaceContext context;
public Saver(FSImageFormatProtobuf.Saver parent,
FileSummary.Builder headers, SaveNamespaceContext context, FSNamesystem fsn) {
FileSummary.Builder headers, SaveNamespaceContext context,
FSNamesystem fsn) {
this.parent = parent;
this.headers = headers;
this.context = context;
@ -349,12 +391,42 @@ public void serializeSnapshotSection(OutputStream out) throws IOException {
parent.commitSection(headers, FSImageFormatProtobuf.SectionName.SNAPSHOT);
}
/**
* This can only be called after serializing both INode_Dir and SnapshotDiff
*/
public void serializeINodeReferenceSection(OutputStream out)
throws IOException {
final List<INodeReference> refList = parent.getSaverContext()
.getRefList();
for (INodeReference ref : refList) {
INodeReferenceSection.INodeReference.Builder rb = buildINodeReference(ref);
rb.build().writeDelimitedTo(out);
}
parent.commitSection(headers, SectionName.INODE_REFRENCE);
}
private INodeReferenceSection.INodeReference.Builder buildINodeReference(
INodeReference ref) throws IOException {
INodeReferenceSection.INodeReference.Builder rb =
INodeReferenceSection.INodeReference.newBuilder().
setReferredId(ref.getId());
if (ref instanceof WithName) {
rb.setLastSnapshotId(((WithName) ref).getLastSnapshotId()).setName(
ByteString.copyFrom(ref.getLocalNameBytes()));
} else if (ref instanceof DstReference) {
rb.setDstSnapshotId(((DstReference) ref).getDstSnapshotId());
}
return rb;
}
/**
* Save all the snapshot diffs to the fsimage
*/
public void serializeSnapshotDiffSection(OutputStream out)
throws IOException {
INodeMap inodesMap = fsn.getFSDirectory().getINodeMap();
final List<INodeReference> refList = parent.getSaverContext()
.getRefList();
int i = 0;
Iterator<INodeWithAdditionalFields> iter = inodesMap.getMapIterator();
while (iter.hasNext()) {
@ -362,7 +434,7 @@ public void serializeSnapshotDiffSection(OutputStream out)
if (inode.isFile()) {
serializeFileDiffList(inode.asFile(), out);
} else if (inode.isDirectory()) {
serializeDirDiffList(inode.asDirectory(), out);
serializeDirDiffList(inode.asDirectory(), refList, out);
}
++i;
if (i % FSImageFormatProtobuf.Saver.CHECK_CANCEL_INTERVAL == 0) {
@ -397,22 +469,18 @@ private void serializeFileDiffList(INodeFile file, OutputStream out)
}
}
private void saveCreatedDeletedList(List<INode> created,
List<INodeReference> deletedRefs, OutputStream out) throws IOException {
private void saveCreatedList(List<INode> created, OutputStream out)
throws IOException {
// local names of the created list members
for (INode c : created) {
SnapshotDiffSection.CreatedListEntry.newBuilder()
.setName(ByteString.copyFrom(c.getLocalNameBytes())).build()
.writeDelimitedTo(out);
}
// reference nodes in deleted list
for (INodeReference ref : deletedRefs) {
INodeSection.INodeReference.Builder rb = buildINodeReference(ref);
rb.build().writeDelimitedTo(out);
}
}
private void serializeDirDiffList(INodeDirectory dir, OutputStream out)
private void serializeDirDiffList(INodeDirectory dir,
final List<INodeReference> refList, OutputStream out)
throws IOException {
DirectoryWithSnapshotFeature sf = dir.getDirectoryWithSnapshotFeature();
if (sf != null) {
@ -438,17 +506,16 @@ private void serializeDirDiffList(INodeDirectory dir, OutputStream out)
.getList(ListType.CREATED);
db.setCreatedListSize(created.size());
List<INode> deleted = diff.getChildrenDiff().getList(ListType.DELETED);
List<INodeReference> refs = new ArrayList<INodeReference>();
for (INode d : deleted) {
if (d.isReference()) {
refs.add(d.asReference());
refList.add(d.asReference());
db.addDeletedINodeRef(refList.size() - 1);
} else {
db.addDeletedINode(d.getId());
}
}
db.setNumOfDeletedRef(refs.size());
db.build().writeDelimitedTo(out);
saveCreatedDeletedList(created, refs, out);
saveCreatedList(created, out);
}
}
}

View File

@ -109,6 +109,7 @@
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.security.token.TokenIdentifier;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Charsets;
import com.sun.jersey.spi.container.ResourceFilters;
@ -162,9 +163,10 @@ private void init(final UserGroupInformation ugi,
response.setContentType(null);
}
@VisibleForTesting
static DatanodeInfo chooseDatanode(final NameNode namenode,
final String path, final HttpOpParam.Op op, final long openOffset,
final long blocksize, final Configuration conf) throws IOException {
final long blocksize) throws IOException {
final BlockManager bm = namenode.getNamesystem().getBlockManager();
if (op == PutOpParam.Op.CREATE) {
@ -203,7 +205,7 @@ static DatanodeInfo chooseDatanode(final NameNode namenode,
final LocatedBlocks locations = np.getBlockLocations(path, offset, 1);
final int count = locations.locatedBlockCount();
if (count > 0) {
return JspHelper.bestNode(locations.get(0).getLocations(), false, conf);
return bestNode(locations.get(0).getLocations());
}
}
}
@ -212,13 +214,26 @@ static DatanodeInfo chooseDatanode(final NameNode namenode,
).chooseRandom(NodeBase.ROOT);
}
/**
* Choose the datanode to redirect the request. Note that the nodes have been
* sorted based on availability and network distances, thus it is sufficient
* to return the first element of the sorted list here.
*/
private static DatanodeInfo bestNode(DatanodeInfo[] nodes) throws IOException {
if (nodes.length == 0 || nodes[0].isDecommissioned()) {
throw new IOException("No active nodes contain this block");
}
return nodes[0];
}
private Token<? extends TokenIdentifier> generateDelegationToken(
final NameNode namenode, final UserGroupInformation ugi,
final String renewer) throws IOException {
final Credentials c = DelegationTokenSecretManager.createCredentials(
namenode, ugi, renewer != null? renewer: ugi.getShortUserName());
final Token<? extends TokenIdentifier> t = c.getAllTokens().iterator().next();
Text kind = request.getScheme().equals("http") ? WebHdfsFileSystem.TOKEN_KIND : SWebHdfsFileSystem.TOKEN_KIND;
Text kind = request.getScheme().equals("http") ? WebHdfsFileSystem.TOKEN_KIND
: SWebHdfsFileSystem.TOKEN_KIND;
t.setKind(kind);
return t;
}
@ -229,9 +244,8 @@ private URI redirectURI(final NameNode namenode,
final String path, final HttpOpParam.Op op, final long openOffset,
final long blocksize,
final Param<?, ?>... parameters) throws URISyntaxException, IOException {
final Configuration conf = (Configuration)context.getAttribute(JspHelper.CURRENT_CONF);
final DatanodeInfo dn = chooseDatanode(namenode, path, op, openOffset,
blocksize, conf);
blocksize);
final String delegationQuery;
if (!UserGroupInformation.isSecurityEnabled()) {

View File

@ -36,6 +36,7 @@
import org.apache.hadoop.hdfs.server.namenode.FSImageUtil;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeDirectorySection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeReferenceSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INode;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeDirectory;
@ -79,6 +80,7 @@ final class LsrPBImage {
private String[] stringTable;
private HashMap<Long, INodeSection.INode> inodes = Maps.newHashMap();
private HashMap<Long, long[]> dirmap = Maps.newHashMap();
private ArrayList<INodeReferenceSection.INodeReference> refList = Lists.newArrayList();
public LsrPBImage(Configuration conf, PrintWriter out) {
this.conf = conf;
@ -125,6 +127,9 @@ public int compare(FileSummary.Section s1, FileSummary.Section s2) {
case INODE:
loadINodeSection(is);
break;
case INODE_REFRENCE:
loadINodeReferenceSection(is);
break;
case INODE_DIR:
loadINodeDirectorySection(is);
break;
@ -202,14 +207,26 @@ private void loadINodeDirectorySection(InputStream in) throws IOException {
if (e == null) {
break;
}
long[] l = new long[e.getChildrenCount()];
for (int i = 0; i < l.length; ++i) {
long[] l = new long[e.getChildrenCount() + e.getRefChildrenCount()];
for (int i = 0; i < e.getChildrenCount(); ++i) {
l[i] = e.getChildren(i);
}
dirmap.put(e.getParent(), l);
for (int i = 0; i < e.getNumOfRef(); i++) {
INodeSection.INodeReference.parseDelimitedFrom(in);
for (int i = e.getChildrenCount(); i < l.length; i++) {
int refId = e.getRefChildren(i - e.getChildrenCount());
l[i] = refList.get(refId).getReferredId();
}
dirmap.put(e.getParent(), l);
}
}
private void loadINodeReferenceSection(InputStream in) throws IOException {
while (true) {
INodeReferenceSection.INodeReference e = INodeReferenceSection
.INodeReference.parseDelimitedFrom(in);
if (e == null) {
break;
}
refList.add(e);
}
}

View File

@ -43,6 +43,7 @@
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeDirectory;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INodeSymlink;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeReferenceSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.NameSystemSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection;
@ -132,6 +133,9 @@ public int compare(FileSummary.Section s1, FileSummary.Section s2) {
case INODE:
dumpINodeSection(is);
break;
case INODE_REFRENCE:
dumpINodeReferenceSection(is);
break;
case INODE_DIR:
dumpINodeDirectorySection(is);
break;
@ -227,18 +231,27 @@ private void dumpINodeDirectorySection(InputStream in) throws IOException {
for (long id : e.getChildrenList()) {
o("inode", id);
}
for (int i = 0; i < e.getNumOfRef(); i++) {
INodeSection.INodeReference r = INodeSection.INodeReference
.parseDelimitedFrom(in);
dumpINodeReference(r);
for (int refId : e.getRefChildrenList()) {
o("inodereference-index", refId);
}
out.print("</directory>\n");
}
out.print("</INodeDirectorySection>\n");
}
private void dumpINodeReference(INodeSection.INodeReference r) {
private void dumpINodeReferenceSection(InputStream in) throws IOException {
out.print("<INodeReferenceSection>");
while (true) {
INodeReferenceSection.INodeReference e = INodeReferenceSection
.INodeReference.parseDelimitedFrom(in);
if (e == null) {
break;
}
dumpINodeReference(e);
}
}
private void dumpINodeReference(INodeReferenceSection.INodeReference r) {
out.print("<ref>");
o("referredId", r.getReferredId()).o("name", r.getName().toStringUtf8())
.o("dstSnapshotId", r.getDstSnapshotId())
@ -362,10 +375,15 @@ private void dumpSnapshotDiffSection(InputStream in) throws IOException {
o("name", ce.getName().toStringUtf8());
out.print("</created>\n");
}
for (int j = 0; j < d.getNumOfDeletedRef(); ++j) {
INodeSection.INodeReference r = INodeSection.INodeReference
.parseDelimitedFrom(in);
dumpINodeReference(r);
for (long did : d.getDeletedINodeList()) {
out.print("<deleted>");
o("inode", did);
out.print("</deleted>\n");
}
for (int dRefid : d.getDeletedINodeRefList()) {
out.print("<deleted>");
o("inodereference-index", dRefid);
out.print("</deleted>\n");
}
out.print("</dirdiff>\n");
}

View File

@ -132,17 +132,6 @@ message INodeSection {
optional bytes target = 2;
}
message INodeReference {
// id of the referred inode
optional uint64 referredId = 1;
// local name recorded in WithName
optional bytes name = 2;
// recorded in DstReference
optional uint32 dstSnapshotId = 3;
// recorded in WithName
optional uint32 lastSnapshotId = 4;
}
message INode {
enum Type {
FILE = 1;
@ -183,13 +172,28 @@ message FilesUnderConstructionSection {
message INodeDirectorySection {
message DirEntry {
optional uint64 parent = 1;
// children that are not reference nodes
repeated uint64 children = 2 [packed = true];
optional uint64 numOfRef = 3;
// repeated INodeReference...
// children that are reference nodes, each element is a reference node id
repeated uint32 refChildren = 3 [packed = true];
}
// repeated DirEntry, ended at the boundary of the section.
}
message INodeReferenceSection {
message INodeReference {
// id of the referred inode
optional uint64 referredId = 1;
// local name recorded in WithName
optional bytes name = 2;
// recorded in DstReference
optional uint32 dstSnapshotId = 3;
// recorded in WithName
optional uint32 lastSnapshotId = 4;
}
// repeated INodeReference...
}
/**
* This section records the information about snapshot
* NAME: SNAPSHOT
@ -224,10 +228,10 @@ message SnapshotDiffSection {
optional bytes name = 4;
optional INodeSection.INodeDirectory snapshotCopy = 5;
optional uint32 createdListSize = 6;
optional uint32 numOfDeletedRef = 7; // number of reference nodes in deleted list
repeated uint64 deletedINode = 8 [packed = true]; // id of deleted inode
repeated uint64 deletedINode = 7 [packed = true]; // id of deleted inodes
// id of reference nodes in the deleted list
repeated uint32 deletedINodeRef = 8 [packed = true];
// repeated CreatedListEntry (size is specified by createdListSize)
// repeated INodeReference (reference inodes in deleted list)
}
message FileDiff {

View File

@ -161,6 +161,16 @@
</description>
</property>
<property>
<name>dfs.client.cached.conn.retry</name>
<value>3</value>
<description>The number of times the HDFS client will pull a socket from the
cache. Once this number is exceeded, the client will try to create a new
socket.
</description>
</property>
<property>
<name>dfs.https.server.keystore.resource</name>
<value>ssl-server.xml</value>
@ -1499,6 +1509,26 @@
</description>
</property>
<property>
<name>dfs.client.mmap.retry.timeout.ms</name>
<value>300000</value>
<description>
The minimum amount of time that we will wait before retrying a failed mmap
operation.
</description>
</property>
<property>
<name>dfs.client.short.circuit.replica.stale.threshold.ms</name>
<value>3000000</value>
<description>
The maximum amount of time that we will consider a short-circuit replica to
be valid, if there is no communication from the DataNode. After this time
has elapsed, we will re-fetch the short-circuit replica even if it is in
the cache.
</description>
</property>
<property>
<name>dfs.namenode.path.based.cache.block.map.allocation.percent</name>
<value>0.25</value>
@ -1628,4 +1658,15 @@
</description>
</property>
<property>
<name>dfs.client.context</name>
<value>default</value>
<description>
The name of the DFSClient context that we should use. Clients that share
a context share a socket cache and short-circuit cache, among other things.
You should only change this if you don't want to share with another set of
threads.
</description>
</property>
</configuration>
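The client-side properties added above can also be set programmatically on a Configuration before opening a FileSystem. A hedged sketch (the property names and default values are taken from the descriptions above; the context name and NameNode URI are illustrative assumptions):

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class ClientContextExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Use a dedicated client context so this client gets its own socket cache
    // and short-circuit cache instead of sharing the "default" context.
    conf.set("dfs.client.context", "my-isolated-context");
    // Cache tuning knobs documented above (set here to their documented defaults).
    conf.setInt("dfs.client.cached.conn.retry", 3);
    conf.setLong("dfs.client.mmap.retry.timeout.ms", 300000L);
    conf.setLong("dfs.client.short.circuit.replica.stale.threshold.ms", 3000000L);

    // Hypothetical NameNode address, for illustration only.
    FileSystem fs = FileSystem.get(URI.create("hdfs://namenode:8020/"), conf);
    System.out.println("Connected to " + fs.getUri());
    fs.close();
  }
}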

View File

@ -28,32 +28,39 @@
import java.util.Random;
import org.apache.commons.lang.SystemUtils;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.ExtendedBlockId;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitCache.CacheVisitor;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.ByteBufferPool;
import org.apache.hadoop.io.ElasticByteBufferPool;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.BeforeClass;
import org.junit.Test;
import java.util.Map;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
@ -250,17 +257,39 @@ public void testZeroCopyReadsNoFallback() throws Exception {
}
}
private static class CountingVisitor
implements ClientMmapManager.ClientMmapVisitor {
int count = 0;
private static class CountingVisitor implements CacheVisitor {
private final int expectedNumOutstandingMmaps;
private final int expectedNumReplicas;
private final int expectedNumEvictable;
private final int expectedNumMmapedEvictable;
@Override
public void accept(ClientMmap mmap) {
count++;
CountingVisitor(int expectedNumOutstandingMmaps,
int expectedNumReplicas, int expectedNumEvictable,
int expectedNumMmapedEvictable) {
this.expectedNumOutstandingMmaps = expectedNumOutstandingMmaps;
this.expectedNumReplicas = expectedNumReplicas;
this.expectedNumEvictable = expectedNumEvictable;
this.expectedNumMmapedEvictable = expectedNumMmapedEvictable;
}
public void reset() {
count = 0;
@Override
public void visit(int numOutstandingMmaps,
Map<ExtendedBlockId, ShortCircuitReplica> replicas,
Map<ExtendedBlockId, InvalidToken> failedLoads,
Map<Long, ShortCircuitReplica> evictable,
Map<Long, ShortCircuitReplica> evictableMmapped) {
if (expectedNumOutstandingMmaps >= 0) {
Assert.assertEquals(expectedNumOutstandingMmaps, numOutstandingMmaps);
}
if (expectedNumReplicas >= 0) {
Assert.assertEquals(expectedNumReplicas, replicas.size());
}
if (expectedNumEvictable >= 0) {
Assert.assertEquals(expectedNumEvictable, evictable.size());
}
if (expectedNumMmapedEvictable >= 0) {
Assert.assertEquals(expectedNumMmapedEvictable, evictableMmapped.size());
}
}
}
@ -271,105 +300,98 @@ public void testZeroCopyMmapCache() throws Exception {
final Path TEST_PATH = new Path("/a");
final int TEST_FILE_LENGTH = 16385;
final int RANDOM_SEED = 23453;
final String CONTEXT = "testZeroCopyMmapCacheContext";
FSDataInputStream fsIn = null;
ByteBuffer results[] = { null, null, null, null, null };
ByteBuffer results[] = { null, null, null, null };
DistributedFileSystem fs = null;
conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, CONTEXT);
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
fs = cluster.getFileSystem();
DFSTestUtil.createFile(fs, TEST_PATH,
TEST_FILE_LENGTH, (short)1, RANDOM_SEED);
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
fs = cluster.getFileSystem();
DFSTestUtil.createFile(fs, TEST_PATH,
TEST_FILE_LENGTH, (short)1, RANDOM_SEED);
try {
DFSTestUtil.waitReplication(fs, TEST_PATH, (short)1);
} catch (InterruptedException e) {
Assert.fail("unexpected InterruptedException during " +
"waitReplication: " + e);
} catch (TimeoutException e) {
Assert.fail("unexpected TimeoutException during " +
"waitReplication: " + e);
}
fsIn = fs.open(TEST_PATH);
byte original[] = new byte[TEST_FILE_LENGTH];
IOUtils.readFully(fsIn, original, 0, TEST_FILE_LENGTH);
fsIn.close();
fsIn = fs.open(TEST_PATH);
final ClientMmapManager mmapManager = fs.getClient().getMmapManager();
final CountingVisitor countingVisitor = new CountingVisitor();
mmapManager.visitMmaps(countingVisitor);
Assert.assertEquals(0, countingVisitor.count);
mmapManager.visitEvictable(countingVisitor);
Assert.assertEquals(0, countingVisitor.count);
results[0] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
fsIn.seek(0);
results[1] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
mmapManager.visitMmaps(countingVisitor);
Assert.assertEquals(1, countingVisitor.count);
countingVisitor.reset();
mmapManager.visitEvictable(countingVisitor);
Assert.assertEquals(0, countingVisitor.count);
countingVisitor.reset();
// The mmaps should be of the first block of the file.
final ExtendedBlock firstBlock = DFSTestUtil.getFirstBlock(fs, TEST_PATH);
mmapManager.visitMmaps(new ClientMmapManager.ClientMmapVisitor() {
@Override
public void accept(ClientMmap mmap) {
Assert.assertEquals(firstBlock, mmap.getBlock());
}
});
// Read more blocks.
results[2] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
results[3] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
try {
results[4] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
Assert.fail("expected UnsupportedOperationException");
} catch (UnsupportedOperationException e) {
// expected
}
// we should have 3 mmaps, 0 evictable
mmapManager.visitMmaps(countingVisitor);
Assert.assertEquals(3, countingVisitor.count);
countingVisitor.reset();
mmapManager.visitEvictable(countingVisitor);
Assert.assertEquals(0, countingVisitor.count);
// After we close the cursors, the mmaps should be evictable for
// a brief period of time. Then, they should be closed (we're
// using a very quick timeout)
for (ByteBuffer buffer : results) {
if (buffer != null) {
fsIn.releaseBuffer(buffer);
}
}
GenericTestUtils.waitFor(new Supplier<Boolean>() {
public Boolean get() {
countingVisitor.reset();
try {
mmapManager.visitEvictable(countingVisitor);
} catch (InterruptedException e) {
e.printStackTrace();
return false;
}
return (0 == countingVisitor.count);
}
}, 10, 10000);
countingVisitor.reset();
mmapManager.visitMmaps(countingVisitor);
Assert.assertEquals(0, countingVisitor.count);
} finally {
if (fsIn != null) fsIn.close();
if (fs != null) fs.close();
if (cluster != null) cluster.shutdown();
DFSTestUtil.waitReplication(fs, TEST_PATH, (short)1);
} catch (InterruptedException e) {
Assert.fail("unexpected InterruptedException during " +
"waitReplication: " + e);
} catch (TimeoutException e) {
Assert.fail("unexpected TimeoutException during " +
"waitReplication: " + e);
}
fsIn = fs.open(TEST_PATH);
byte original[] = new byte[TEST_FILE_LENGTH];
IOUtils.readFully(fsIn, original, 0, TEST_FILE_LENGTH);
fsIn.close();
fsIn = fs.open(TEST_PATH);
final ShortCircuitCache cache = ClientContext.get(
CONTEXT, new DFSClient.Conf(conf)).getShortCircuitCache();
cache.accept(new CountingVisitor(0, 5, 5, 0));
results[0] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
fsIn.seek(0);
results[1] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
// The mmap should be of the first block of the file.
final ExtendedBlock firstBlock =
DFSTestUtil.getFirstBlock(fs, TEST_PATH);
cache.accept(new CacheVisitor() {
@Override
public void visit(int numOutstandingMmaps,
Map<ExtendedBlockId, ShortCircuitReplica> replicas,
Map<ExtendedBlockId, InvalidToken> failedLoads,
Map<Long, ShortCircuitReplica> evictable,
Map<Long, ShortCircuitReplica> evictableMmapped) {
ShortCircuitReplica replica = replicas.get(
new ExtendedBlockId(firstBlock.getBlockId(), firstBlock.getBlockPoolId()));
Assert.assertNotNull(replica);
Assert.assertTrue(replica.hasMmap());
// The replica should not yet be evictable, since we have it open.
Assert.assertNull(replica.getEvictableTimeNs());
}
});
// Read more blocks.
results[2] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
results[3] = fsIn.read(null, 4096,
EnumSet.of(ReadOption.SKIP_CHECKSUMS));
// we should have 3 mmaps, 1 evictable
cache.accept(new CountingVisitor(3, 5, 2, 0));
// After we close the cursors, the mmaps should be evictable for
// a brief period of time. Then, they should be closed (we're
// using a very quick timeout)
for (ByteBuffer buffer : results) {
if (buffer != null) {
fsIn.releaseBuffer(buffer);
}
}
fsIn.close();
GenericTestUtils.waitFor(new Supplier<Boolean>() {
public Boolean get() {
final MutableBoolean finished = new MutableBoolean(false);
cache.accept(new CacheVisitor() {
@Override
public void visit(int numOutstandingMmaps,
Map<ExtendedBlockId, ShortCircuitReplica> replicas,
Map<ExtendedBlockId, InvalidToken> failedLoads,
Map<Long, ShortCircuitReplica> evictable,
Map<Long, ShortCircuitReplica> evictableMmapped) {
finished.setValue(evictableMmapped.isEmpty());
}
});
return finished.booleanValue();
}
}, 10, 60000);
cache.accept(new CountingVisitor(0, -1, -1, -1));
fs.close();
cluster.shutdown();
}
/**

View File

@ -21,6 +21,7 @@
import java.io.IOException;
import java.security.PrivilegedExceptionAction;
import java.util.UUID;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
@ -1175,4 +1176,32 @@ public void testReservedHdfsPathsOnFS() throws Exception {
public void testReservedHdfsPathsOnFC() throws Exception {
testOnFileContext(new TestReservedHdfsPaths());
}
/**
* Test trying to glob the root. Regression test for HDFS-5888.
**/
private static class TestGlobRoot implements FSTestWrapperGlobTest {
public void run(FSTestWrapper wrap, FSTestWrapper unprivilegedWrap,
FileSystem fs, FileContext fc) throws Exception {
final Path rootPath = new Path("/");
FileStatus oldRootStatus = wrap.getFileStatus(rootPath);
String newOwner = UUID.randomUUID().toString();
wrap.setOwner(new Path("/"), newOwner, null);
FileStatus[] status =
wrap.globStatus(rootPath, new AcceptAllPathFilter());
Assert.assertEquals(1, status.length);
Assert.assertEquals(newOwner, status[0].getOwner());
wrap.setOwner(new Path("/"), oldRootStatus.getOwner(), null);
}
}
@Test
public void testGlobRootOnFS() throws Exception {
testOnFileSystem(new TestGlobRoot());
}
@Test
public void testGlobRootOnFC() throws Exception {
testOnFileContext(new TestGlobRoot());
}
}

View File

@ -28,8 +28,12 @@
import java.util.List;
import java.util.Random;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@ -38,6 +42,8 @@
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.net.NetUtils;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
/**
* A helper class to set up the cluster and to get a BlockReader and a DataNode for a block.
@ -141,22 +147,54 @@ public void readAndCheckEOS(BlockReader reader, int length, boolean expectEof)
*/
public BlockReader getBlockReader(LocatedBlock testBlock, int offset, int lenToRead)
throws IOException {
return getBlockReader(cluster, testBlock, offset, lenToRead);
}
/**
* Get a BlockReader for the given block.
*/
public static BlockReader getBlockReader(MiniDFSCluster cluster,
LocatedBlock testBlock, int offset, int lenToRead) throws IOException {
InetSocketAddress targetAddr = null;
Socket sock = null;
ExtendedBlock block = testBlock.getBlock();
DatanodeInfo[] nodes = testBlock.getLocations();
targetAddr = NetUtils.createSocketAddr(nodes[0].getXferAddr());
sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
sock.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
return BlockReaderFactory.newBlockReader(
new DFSClient.Conf(conf),
targetAddr.toString()+ ":" + block.getBlockId(), block,
testBlock.getBlockToken(),
offset, lenToRead,
true, "BlockReaderTestUtil", TcpPeerServer.peerFromSocket(sock),
nodes[0], null, null, null, false, CachingStrategy.newDefaultStrategy());
final DistributedFileSystem fs = cluster.getFileSystem();
return new BlockReaderFactory(fs.getClient().getConf()).
setInetSocketAddress(targetAddr).
setBlock(block).
setFileName(targetAddr.toString()+ ":" + block.getBlockId()).
setBlockToken(testBlock.getBlockToken()).
setStartOffset(offset).
setLength(lenToRead).
setVerifyChecksum(true).
setClientName("BlockReaderTestUtil").
setDatanodeInfo(nodes[0]).
setClientCacheContext(ClientContext.getFromConf(fs.getConf())).
setCachingStrategy(CachingStrategy.newDefaultStrategy()).
setConfiguration(fs.getConf()).
setAllowShortCircuitLocalReads(true).
setRemotePeerFactory(new RemotePeerFactory() {
@Override
public Peer newConnectedPeer(InetSocketAddress addr)
throws IOException {
Peer peer = null;
Socket sock = NetUtils.
getDefaultSocketFactory(fs.getConf()).createSocket();
try {
sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
peer = TcpPeerServer.peerFromSocket(sock);
} finally {
if (peer == null) {
IOUtils.closeQuietly(sock);
}
}
return peer;
}
}).
build();
}
/**
@ -167,4 +205,13 @@ public DataNode getDataNode(LocatedBlock testBlock) {
int ipcport = nodes[0].getIpcPort();
return cluster.getDataNode(ipcport);
}
}
public static void enableBlockReaderFactoryTracing() {
LogManager.getLogger(BlockReaderFactory.class.getName()).setLevel(
Level.TRACE);
LogManager.getLogger(ShortCircuitCache.class.getName()).setLevel(
Level.TRACE);
LogManager.getLogger(ShortCircuitReplica.class.getName()).setLevel(
Level.TRACE);
}
}

View File

@ -187,10 +187,26 @@ public void createFiles(FileSystem fs, String topdir,
}
}
public static String readFile(FileSystem fs, Path fileName) throws IOException {
public static String readFile(FileSystem fs, Path fileName)
throws IOException {
byte buf[] = readFileBuffer(fs, fileName);
return new String(buf, 0, buf.length);
}
public static byte[] readFileBuffer(FileSystem fs, Path fileName)
throws IOException {
ByteArrayOutputStream os = new ByteArrayOutputStream();
IOUtils.copyBytes(fs.open(fileName), os, 1024, true);
return os.toString();
try {
FSDataInputStream in = fs.open(fileName);
try {
IOUtils.copyBytes(in, os, 1024, true);
return os.toByteArray();
} finally {
in.close();
}
} finally {
os.close();
}
}
public static void createFile(FileSystem fs, Path fileName, long fileLen,
@ -232,6 +248,13 @@ public static void createFile(FileSystem fs, Path fileName, int bufferLen,
}
}
public static byte[] calculateFileContentsFromSeed(long seed, int length) {
Random rb = new Random(seed);
byte val[] = new byte[length];
rb.nextBytes(val);
return val;
}
/** check if the files have been copied correctly. */
public boolean checkFiles(FileSystem fs, String topdir) throws IOException {
Path root = new Path(topdir);
@ -551,8 +574,12 @@ public void cleanup(FileSystem fs, String topdir) throws IOException {
public static ExtendedBlock getFirstBlock(FileSystem fs, Path path) throws IOException {
HdfsDataInputStream in = (HdfsDataInputStream) fs.open(path);
in.readByte();
return in.getCurrentBlock();
try {
in.readByte();
return in.getCurrentBlock();
} finally {
in.close();
}
}
public static List<LocatedBlock> getAllBlocks(FSDataInputStream in)

View File

@ -0,0 +1,285 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.File;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.LogFactory;
import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import com.google.common.util.concurrent.Uninterruptibles;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_BLOCK_SIZE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC;
public class TestBlockReaderFactory {
static final Log LOG = LogFactory.getLog(TestBlockReaderFactory.class);
@Before
public void init() {
DomainSocket.disableBindPathValidation();
}
@After
public void cleanup() {
DFSInputStream.tcpReadsDisabledForTesting = false;
BlockReaderFactory.createShortCircuitReplicaInfoCallback = null;
}
private static Configuration createShortCircuitConf(String testName,
TemporarySocketDirectory sockDir) {
Configuration conf = new Configuration();
conf.set(DFS_CLIENT_CONTEXT, testName);
conf.setLong(DFS_BLOCK_SIZE_KEY, 4096);
conf.set(DFS_DOMAIN_SOCKET_PATH_KEY, new File(sockDir.getDir(),
testName + "._PORT").getAbsolutePath());
conf.setBoolean(DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
false);
conf.setBoolean(DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, false);
return conf;
}
/**
* If we have a UNIX domain socket configured,
* and we have dfs.client.domain.socket.data.traffic set to true,
* and short-circuit access fails, we should still be able to pass
* data traffic over the UNIX domain socket. Test this.
*/
@Test(timeout=60000)
public void testFallbackFromShortCircuitToUnixDomainTraffic()
throws Exception {
DFSInputStream.tcpReadsDisabledForTesting = true;
TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
// The server is NOT configured with short-circuit local reads;
// the client is. Both support UNIX domain reads.
Configuration clientConf = createShortCircuitConf(
"testFallbackFromShortCircuitToUnixDomainTraffic", sockDir);
clientConf.setBoolean(DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, true);
Configuration serverConf = new Configuration(clientConf);
serverConf.setBoolean(DFS_CLIENT_READ_SHORTCIRCUIT_KEY, false);
MiniDFSCluster cluster =
new MiniDFSCluster.Builder(serverConf).numDataNodes(1).build();
cluster.waitActive();
FileSystem dfs = FileSystem.get(cluster.getURI(0), clientConf);
String TEST_FILE = "/test_file";
final int TEST_FILE_LEN = 8193;
final int SEED = 0xFADED;
DFSTestUtil.createFile(dfs, new Path(TEST_FILE), TEST_FILE_LEN,
(short)1, SEED);
byte contents[] = DFSTestUtil.readFileBuffer(dfs, new Path(TEST_FILE));
byte expected[] = DFSTestUtil.
calculateFileContentsFromSeed(SEED, TEST_FILE_LEN);
Assert.assertTrue(Arrays.equals(contents, expected));
cluster.shutdown();
sockDir.close();
}
/**
* Test the case where we have multiple threads waiting on the
* ShortCircuitCache delivering a certain ShortCircuitReplica.
*
* In this case, there should only be one call to
* createShortCircuitReplicaInfo. This one replica should be shared
* by all threads.
*/
@Test(timeout=60000)
public void testMultipleWaitersOnShortCircuitCache()
throws Exception {
final CountDownLatch latch = new CountDownLatch(1);
final AtomicBoolean creationIsBlocked = new AtomicBoolean(true);
final AtomicBoolean testFailed = new AtomicBoolean(false);
DFSInputStream.tcpReadsDisabledForTesting = true;
BlockReaderFactory.createShortCircuitReplicaInfoCallback =
new ShortCircuitCache.ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
Uninterruptibles.awaitUninterruptibly(latch);
if (!creationIsBlocked.compareAndSet(true, false)) {
Assert.fail("there were multiple calls to "
+ "createShortCircuitReplicaInfo. Only one was expected.");
}
return null;
}
};
TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
Configuration conf = createShortCircuitConf(
"testMultipleWaitersOnShortCircuitCache", sockDir);
MiniDFSCluster cluster =
new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
final DistributedFileSystem dfs = cluster.getFileSystem();
final String TEST_FILE = "/test_file";
final int TEST_FILE_LEN = 4000;
final int SEED = 0xFADED;
final int NUM_THREADS = 10;
DFSTestUtil.createFile(dfs, new Path(TEST_FILE), TEST_FILE_LEN,
(short)1, SEED);
Runnable readerRunnable = new Runnable() {
@Override
public void run() {
try {
byte contents[] = DFSTestUtil.readFileBuffer(dfs, new Path(TEST_FILE));
Assert.assertFalse(creationIsBlocked.get());
byte expected[] = DFSTestUtil.
calculateFileContentsFromSeed(SEED, TEST_FILE_LEN);
Assert.assertTrue(Arrays.equals(contents, expected));
} catch (Throwable e) {
LOG.error("readerRunnable error", e);
testFailed.set(true);
}
}
};
Thread threads[] = new Thread[NUM_THREADS];
for (int i = 0; i < NUM_THREADS; i++) {
threads[i] = new Thread(readerRunnable);
threads[i].start();
}
Thread.sleep(500);
latch.countDown();
for (int i = 0; i < NUM_THREADS; i++) {
Uninterruptibles.joinUninterruptibly(threads[i]);
}
cluster.shutdown();
sockDir.close();
Assert.assertFalse(testFailed.get());
}
/**
* Test the case where a short-circuit read fails the first time and then
* succeeds on a later attempt.
* Any thread waiting on a cache load should receive the failure (if it
* occurs); however, the failure result should not be cached. We want
* to be able to retry later and succeed.
*/
@Test(timeout=60000)
public void testShortCircuitCacheTemporaryFailure()
throws Exception {
BlockReaderTestUtil.enableBlockReaderFactoryTracing();
final AtomicBoolean replicaCreationShouldFail = new AtomicBoolean(true);
final AtomicBoolean testFailed = new AtomicBoolean(false);
DFSInputStream.tcpReadsDisabledForTesting = true;
BlockReaderFactory.createShortCircuitReplicaInfoCallback =
new ShortCircuitCache.ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
if (replicaCreationShouldFail.get()) {
// Insert a short delay to increase the chance that one client
// thread waits for the other client thread's failure via
// a condition variable.
Uninterruptibles.sleepUninterruptibly(2, TimeUnit.SECONDS);
return new ShortCircuitReplicaInfo();
}
return null;
}
};
TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
Configuration conf = createShortCircuitConf(
"testShortCircuitCacheTemporaryFailure", sockDir);
final MiniDFSCluster cluster =
new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
final DistributedFileSystem dfs = cluster.getFileSystem();
final String TEST_FILE = "/test_file";
final int TEST_FILE_LEN = 4000;
final int NUM_THREADS = 2;
final int SEED = 0xFADED;
final CountDownLatch gotFailureLatch = new CountDownLatch(NUM_THREADS);
final CountDownLatch shouldRetryLatch = new CountDownLatch(1);
DFSTestUtil.createFile(dfs, new Path(TEST_FILE), TEST_FILE_LEN,
(short)1, SEED);
Runnable readerRunnable = new Runnable() {
@Override
public void run() {
try {
// First time should fail.
List<LocatedBlock> locatedBlocks =
cluster.getNameNode().getRpcServer().getBlockLocations(
TEST_FILE, 0, TEST_FILE_LEN).getLocatedBlocks();
LocatedBlock lblock = locatedBlocks.get(0); // first block
BlockReader blockReader = null;
try {
blockReader = BlockReaderTestUtil.
getBlockReader(cluster, lblock, 0, TEST_FILE_LEN);
Assert.fail("expected getBlockReader to fail the first time.");
} catch (Throwable t) {
Assert.assertTrue("expected to see 'TCP reads were disabled " +
"for testing' in exception " + t, t.getMessage().contains(
"TCP reads were disabled for testing"));
} finally {
if (blockReader != null) blockReader.close(); // keep findbugs happy
}
gotFailureLatch.countDown();
shouldRetryLatch.await();
// Second time should succeed.
try {
blockReader = BlockReaderTestUtil.
getBlockReader(cluster, lblock, 0, TEST_FILE_LEN);
} catch (Throwable t) {
LOG.error("error trying to retrieve a block reader " +
"the second time.", t);
throw t;
} finally {
if (blockReader != null) blockReader.close();
}
} catch (Throwable t) {
LOG.error("getBlockReader failure", t);
testFailed.set(true);
}
}
};
Thread threads[] = new Thread[NUM_THREADS];
for (int i = 0; i < NUM_THREADS; i++) {
threads[i] = new Thread(readerRunnable);
threads[i].start();
}
gotFailureLatch.await();
replicaCreationShouldFail.set(false);
shouldRetryLatch.countDown();
for (int i = 0; i < NUM_THREADS; i++) {
Uninterruptibles.joinUninterruptibly(threads[i]);
}
cluster.shutdown();
sockDir.close();
Assert.assertFalse(testFailed.get());
}
}

View File

@ -30,13 +30,15 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.apache.hadoop.util.Time;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Assume;
@ -155,6 +157,8 @@ public void runBlockReaderLocalTest(BlockReaderLocalTest test,
File metaFile = MiniDFSCluster.getBlockMetadataFile(0, block);
DatanodeID datanodeID = cluster.getDataNodes().get(0).getDatanodeId();
ShortCircuitCache shortCircuitCache =
ClientContext.getFromConf(conf).getShortCircuitCache();
cluster.shutdown();
cluster = null;
test.setup(dataFile, checksum);
@ -164,16 +168,17 @@ public void runBlockReaderLocalTest(BlockReaderLocalTest test,
};
dataIn = streams[0];
metaIn = streams[1];
ExtendedBlockId key = new ExtendedBlockId(block.getBlockId(), block.getBlockPoolId());
ShortCircuitReplica replica = new ShortCircuitReplica(
key, dataIn, metaIn, shortCircuitCache, Time.now());
blockReaderLocal = new BlockReaderLocal.Builder(
new DFSClient.Conf(conf)).
setFilename(TEST_PATH.getName()).
setBlock(block).
setStreams(streams).
setShortCircuitReplica(replica).
setDatanodeID(datanodeID).
setCachingStrategy(new CachingStrategy(false, readahead)).
setVerifyChecksum(checksum).
setBlockMetadataHeader(BlockMetadataHeader.preadHeader(
metaIn.getChannel())).
build();
dataIn = null;
metaIn = null;

View File

@ -25,18 +25,8 @@
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.security.token.Token;
import org.junit.Assert;
import org.junit.Test;
import org.mockito.Matchers;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
/**
* This class tests the client connection caching in a single node
@ -48,30 +38,6 @@ public class TestConnCache {
static final int BLOCK_SIZE = 4096;
static final int FILE_SIZE = 3 * BLOCK_SIZE;
/**
* A mock Answer to remember the BlockReader used.
*
* It verifies that all invocation to DFSInputStream.getBlockReader()
* use the same peer.
*/
private class MockGetBlockReader implements Answer<RemoteBlockReader2> {
public RemoteBlockReader2 reader = null;
private Peer peer = null;
@Override
public RemoteBlockReader2 answer(InvocationOnMock invocation) throws Throwable {
RemoteBlockReader2 prevReader = reader;
reader = (RemoteBlockReader2) invocation.callRealMethod();
if (peer == null) {
peer = reader.getPeer();
} else if (prevReader != null) {
Assert.assertSame("DFSInputStream should use the same peer",
peer, reader.getPeer());
}
return reader;
}
}
/**
* (Optionally) seek to position, read and verify data.
*
@ -115,33 +81,29 @@ private void pread(DFSInputStream in,
* @throws Exception
*/
@Test
@SuppressWarnings("unchecked")
public void testReadFromOneDN() throws Exception {
BlockReaderTestUtil util = new BlockReaderTestUtil(1,
new HdfsConfiguration());
HdfsConfiguration configuration = new HdfsConfiguration();
// One of the goals of this test is to verify that we don't open more
// than one socket. So use a different client context, so that we
// get our own socket cache, rather than sharing with the other test
// instances. Also use a really long socket timeout so that nothing
// gets closed before we get around to checking the cache size at the end.
final String contextName = "testReadFromOneDNContext";
configuration.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, contextName);
configuration.setLong(DFSConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY,
100000000L);
BlockReaderTestUtil util = new BlockReaderTestUtil(1, configuration);
final Path testFile = new Path("/testConnCache.dat");
byte authenticData[] = util.writeFile(testFile, FILE_SIZE / 1024);
DFSClient client = new DFSClient(
new InetSocketAddress("localhost",
util.getCluster().getNameNodePort()), util.getConf());
DFSInputStream in = Mockito.spy(client.open(testFile.toString()));
ClientContext cacheContext =
ClientContext.get(contextName, client.getConf());
DFSInputStream in = client.open(testFile.toString());
LOG.info("opened " + testFile.toString());
byte[] dataBuf = new byte[BLOCK_SIZE];
MockGetBlockReader answer = new MockGetBlockReader();
Mockito.doAnswer(answer).when(in).getBlockReader(
(InetSocketAddress) Matchers.anyObject(),
(DatanodeInfo) Matchers.anyObject(),
Matchers.anyString(),
(ExtendedBlock) Matchers.anyObject(),
(Token<BlockTokenIdentifier>) Matchers.anyObject(),
Matchers.anyLong(),
Matchers.anyLong(),
Matchers.anyInt(),
Matchers.anyBoolean(),
Matchers.anyString(),
(CachingStrategy)Matchers.anyObject());
// Initial read
pread(in, 0, dataBuf, 0, dataBuf.length, authenticData);
// Read again and verify that the socket is the same
@ -153,5 +115,8 @@ public void testReadFromOneDN() throws Exception {
pread(in, 64, dataBuf, 0, dataBuf.length / 2, authenticData);
in.close();
client.close();
Assert.assertEquals(1,
ClientContext.getFromConf(configuration).getPeerCache().size());
}
}
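
The hunks above drop the Mockito spy on DFSInputStream#getBlockReader and instead give the test its own named ClientContext, then assert directly on that context's PeerCache. A minimal sketch of the isolation pattern, using only the ClientContext, PeerCache, and DFSConfigKeys members that appear in this diff; the helper class and context name below are illustrative, not part of the patch:

package org.apache.hadoop.hdfs;

import org.apache.hadoop.conf.Configuration;

// Illustrative helper: builds a Configuration bound to its own client context so
// the test's socket cache is not shared, and returns the PeerCache that any
// DFSClient created from that Configuration will use.
class ClientContextSketch {
  static PeerCache isolatedPeerCache(String contextName) {
    Configuration conf = new HdfsConfiguration();
    // A unique context name gives this test a private socket (peer) cache.
    conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, contextName);
    // A very long socket timeout keeps cached peers from being closed mid-test.
    conf.setLong(DFSConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY, 100000000L);
    return ClientContext.getFromConf(conf).getPeerCache();
  }
}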

View File

@ -22,7 +22,7 @@
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_DATANODE_SOCKET_WRITE_TIMEOUT_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_CLIENT_CONTEXT;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
@ -86,21 +86,22 @@ public void testDatanodeRespectsKeepAliveTimeout() throws Exception {
// the datanode-side expiration time.
final long CLIENT_EXPIRY_MS = 60000L;
clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT, CLIENT_EXPIRY_MS);
clientConf.set(DFS_CLIENT_CONTEXT, "testDatanodeRespectsKeepAliveTimeout");
DistributedFileSystem fs =
(DistributedFileSystem)FileSystem.get(cluster.getURI(),
clientConf);
PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
// Clients that write aren't currently re-used.
assertEquals(0, fs.dfs.peerCache.size());
assertEquals(0, peerCache.size());
assertXceiverCount(0);
// Reads the file, so we should get a
// cached socket, and should have an xceiver on the other side.
DFSTestUtil.readFile(fs, TEST_FILE);
assertEquals(1, fs.dfs.peerCache.size());
assertEquals(1, peerCache.size());
assertXceiverCount(1);
// Sleep for a bit longer than the keepalive timeout
@ -111,15 +112,13 @@ public void testDatanodeRespectsKeepAliveTimeout() throws Exception {
// The socket is still in the cache, because we don't
// notice that it's closed until we try to read
// from it again.
assertEquals(1, fs.dfs.peerCache.size());
assertEquals(1, peerCache.size());
// Take it out of the cache - reading should
// give an EOF.
Peer peer = fs.dfs.peerCache.get(dn.getDatanodeId(), false);
Peer peer = peerCache.get(dn.getDatanodeId(), false);
assertNotNull(peer);
assertEquals(-1, peer.getInputStream().read());
PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT,
DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT);
}
/**
@ -132,34 +131,33 @@ public void testClientResponsesKeepAliveTimeout() throws Exception {
// the datanode-side expiration time.
final long CLIENT_EXPIRY_MS = 10L;
clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT, CLIENT_EXPIRY_MS);
clientConf.set(DFS_CLIENT_CONTEXT, "testClientResponsesKeepAliveTimeout");
DistributedFileSystem fs =
(DistributedFileSystem)FileSystem.get(cluster.getURI(),
clientConf);
PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
// Clients that write aren't currently re-used.
assertEquals(0, fs.dfs.peerCache.size());
assertEquals(0, peerCache.size());
assertXceiverCount(0);
// Reads the file, so we should get a
// cached socket, and should have an xceiver on the other side.
DFSTestUtil.readFile(fs, TEST_FILE);
assertEquals(1, fs.dfs.peerCache.size());
assertEquals(1, peerCache.size());
assertXceiverCount(1);
// Sleep for a bit longer than the client keepalive timeout.
Thread.sleep(CLIENT_EXPIRY_MS + 1);
// Taking out a peer which is expired should give a null.
Peer peer = fs.dfs.peerCache.get(dn.getDatanodeId(), false);
Peer peer = peerCache.get(dn.getDatanodeId(), false);
assertTrue(peer == null);
// The socket cache is now empty.
assertEquals(0, fs.dfs.peerCache.size());
PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT,
DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_DEFAULT);
assertEquals(0, peerCache.size());
}
/**
@ -174,7 +172,7 @@ public void testSlowReader() throws Exception {
final long CLIENT_EXPIRY_MS = 600000L;
Configuration clientConf = new Configuration(conf);
clientConf.setLong(DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
PeerCache.setInstance(DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT, CLIENT_EXPIRY_MS);
clientConf.set(DFS_CLIENT_CONTEXT, "testSlowReader");
DistributedFileSystem fs =
(DistributedFileSystem)FileSystem.get(cluster.getURI(),
clientConf);
@ -209,7 +207,12 @@ public Boolean get() {
@Test(timeout=30000)
public void testManyClosedSocketsInCache() throws Exception {
// Make a small file
DistributedFileSystem fs = cluster.getFileSystem();
Configuration clientConf = new Configuration(conf);
clientConf.set(DFS_CLIENT_CONTEXT, "testManyClosedSocketsInCache");
DistributedFileSystem fs =
(DistributedFileSystem)FileSystem.get(cluster.getURI(),
clientConf);
PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
// Insert a bunch of dead sockets in the cache, by opening
@ -227,15 +230,14 @@ public void testManyClosedSocketsInCache() throws Exception {
IOUtils.cleanup(null, stms);
}
DFSClient client = ((DistributedFileSystem)fs).dfs;
assertEquals(5, client.peerCache.size());
assertEquals(5, peerCache.size());
// Let all the xceivers timeout
Thread.sleep(1500);
assertXceiverCount(0);
// Client side still has the sockets cached
assertEquals(5, client.peerCache.size());
assertEquals(5, peerCache.size());
// Reading should not throw an exception.
DFSTestUtil.readFile(fs, TEST_FILE);
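
The keepalive tests above now assert on a per-context PeerCache rather than fs.dfs.peerCache, and rely on DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY to age entries out. A condensed sketch of that expiry check, assuming only calls visible in this diff; the context name, timing constant, and helper class are illustrative:

package org.apache.hadoop.hdfs;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;

// Illustrative sketch: read once to populate the cache, wait past the
// client-side expiry, then expect the cached peer to no longer be handed out.
class PeerCacheExpirySketch {
  static void checkExpiry(MiniDFSCluster cluster, Configuration conf,
      Path file, DatanodeID dnId) throws Exception {
    final long CLIENT_EXPIRY_MS = 10L;
    Configuration clientConf = new Configuration(conf);
    clientConf.setLong(
        DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY, CLIENT_EXPIRY_MS);
    clientConf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT, "peerCacheExpirySketch");
    FileSystem fs = FileSystem.get(cluster.getURI(), clientConf);
    PeerCache peerCache = ClientContext.getFromConf(clientConf).getPeerCache();
    DFSTestUtil.readFile(fs, file);      // populates the cache with one peer
    assertEquals(1, peerCache.size());
    Thread.sleep(CLIENT_EXPIRY_MS + 1);  // let the cached peer expire
    Peer peer = peerCache.get(dnId, false);
    assertNull(peer);                    // expired entries are not returned
  }
}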

View File

@ -53,7 +53,8 @@ public void testDisableCache() throws Exception {
FileSystem fsWithoutCache = FileSystem.newInstance(util.getConf());
try {
DFSTestUtil.readFile(fsWithoutCache, testFile);
assertEquals(0, ((DistributedFileSystem)fsWithoutCache).dfs.peerCache.size());
assertEquals(0, ((DistributedFileSystem)fsWithoutCache).
dfs.getClientContext().getPeerCache().size());
} finally {
fsWithoutCache.close();
util.shutdown();

View File

@ -1,126 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.Assert;
import org.junit.Test;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
public class TestFileInputStreamCache {
static final Log LOG = LogFactory.getLog(TestFileInputStreamCache.class);
@Test
public void testCreateAndDestroy() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(10, 1000);
cache.close();
}
private static class TestFileDescriptorPair {
TemporarySocketDirectory dir = new TemporarySocketDirectory();
FileInputStream fis[];
public TestFileDescriptorPair() throws IOException {
fis = new FileInputStream[2];
for (int i = 0; i < 2; i++) {
String name = dir.getDir() + "/file" + i;
FileOutputStream fos = new FileOutputStream(name);
fos.write(1);
fos.close();
fis[i] = new FileInputStream(name);
}
}
public FileInputStream[] getFileInputStreams() {
return fis;
}
public void close() throws IOException {
IOUtils.cleanup(LOG, fis);
dir.close();
}
public boolean compareWith(FileInputStream other[]) {
if ((other == null) || (fis == null)) {
return other == fis;
}
if (fis.length != other.length) return false;
for (int i = 0; i < fis.length; i++) {
if (fis[i] != other[i]) return false;
}
return true;
}
}
@Test
public void testAddAndRetrieve() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(1, 1000000);
DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8080, 9090, 7070, 6060);
ExtendedBlock block = new ExtendedBlock("poolid", 123);
TestFileDescriptorPair pair = new TestFileDescriptorPair();
cache.put(dnId, block, pair.getFileInputStreams());
FileInputStream fis[] = cache.get(dnId, block);
Assert.assertTrue(pair.compareWith(fis));
pair.close();
cache.close();
}
@Test
public void testExpiry() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(1, 10);
DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8080, 9090, 7070, 6060);
ExtendedBlock block = new ExtendedBlock("poolid", 123);
TestFileDescriptorPair pair = new TestFileDescriptorPair();
cache.put(dnId, block, pair.getFileInputStreams());
Thread.sleep(cache.getExpiryTimeMs() * 100);
FileInputStream fis[] = cache.get(dnId, block);
Assert.assertNull(fis);
pair.close();
cache.close();
}
@Test
public void testEviction() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(1, 10000000);
DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8080, 9090, 7070, 6060);
ExtendedBlock block = new ExtendedBlock("poolid", 123);
TestFileDescriptorPair pair = new TestFileDescriptorPair();
cache.put(dnId, block, pair.getFileInputStreams());
DatanodeID dnId2 = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8081, 9091, 7071, 6061);
TestFileDescriptorPair pair2 = new TestFileDescriptorPair();
cache.put(dnId2, block, pair2.getFileInputStreams());
FileInputStream fis[] = cache.get(dnId, block);
Assert.assertNull(fis);
FileInputStream fis2[] = cache.get(dnId2, block);
Assert.assertTrue(pair2.compareWith(fis2));
pair.close();
cache.close();
}
}

View File

@ -303,5 +303,6 @@ public void testGetFileStatusOnDir() throws Exception {
FileSystem.LOG.info("GOOD: getting an exception", ioe);
}
}
fs.delete(dir, true);
}
}

View File

@ -0,0 +1,346 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitCache.ShortCircuitReplicaCreator;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.DataChecksum;
import org.apache.hadoop.util.Time;
import org.junit.Assert;
import org.junit.Test;
import com.google.common.base.Preconditions;
import com.google.common.base.Supplier;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
public class TestShortCircuitCache {
static final Log LOG = LogFactory.getLog(TestShortCircuitCache.class);
private static class TestFileDescriptorPair {
TemporarySocketDirectory dir = new TemporarySocketDirectory();
FileInputStream fis[];
public TestFileDescriptorPair() throws IOException {
fis = new FileInputStream[2];
for (int i = 0; i < 2; i++) {
String name = dir.getDir() + "/file" + i;
FileOutputStream fos = new FileOutputStream(name);
if (i == 0) {
// write 'data' file
fos.write(1);
} else {
// write 'metadata' file
BlockMetadataHeader header =
new BlockMetadataHeader((short)1,
DataChecksum.newDataChecksum(DataChecksum.Type.NULL, 4));
DataOutputStream dos = new DataOutputStream(fos);
BlockMetadataHeader.writeHeader(dos, header);
dos.close();
}
fos.close();
fis[i] = new FileInputStream(name);
}
}
public FileInputStream[] getFileInputStreams() {
return fis;
}
public void close() throws IOException {
IOUtils.cleanup(LOG, fis);
dir.close();
}
public boolean compareWith(FileInputStream data, FileInputStream meta) {
return ((data == fis[0]) && (meta == fis[1]));
}
}
private static class SimpleReplicaCreator
implements ShortCircuitReplicaCreator {
private final int blockId;
private final ShortCircuitCache cache;
private final TestFileDescriptorPair pair;
SimpleReplicaCreator(int blockId, ShortCircuitCache cache,
TestFileDescriptorPair pair) {
this.blockId = blockId;
this.cache = cache;
this.pair = pair;
}
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
try {
ExtendedBlockId key = new ExtendedBlockId(blockId, "test_bp1");
return new ShortCircuitReplicaInfo(
new ShortCircuitReplica(key,
pair.getFileInputStreams()[0], pair.getFileInputStreams()[1],
cache, Time.monotonicNow()));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
@Test(timeout=60000)
public void testCreateAndDestroy() throws Exception {
ShortCircuitCache cache =
new ShortCircuitCache(10, 1, 10, 1, 1, 10000);
cache.close();
}
@Test(timeout=60000)
public void testAddAndRetrieve() throws Exception {
final ShortCircuitCache cache =
new ShortCircuitCache(10, 10000000, 10, 10000000, 1, 10000);
final TestFileDescriptorPair pair = new TestFileDescriptorPair();
ShortCircuitReplicaInfo replicaInfo1 =
cache.fetchOrCreate(new ExtendedBlockId(123, "test_bp1"),
new SimpleReplicaCreator(123, cache, pair));
Preconditions.checkNotNull(replicaInfo1.getReplica());
Preconditions.checkState(replicaInfo1.getInvalidTokenException() == null);
pair.compareWith(replicaInfo1.getReplica().getDataStream(),
replicaInfo1.getReplica().getMetaStream());
ShortCircuitReplicaInfo replicaInfo2 =
cache.fetchOrCreate(new ExtendedBlockId(123, "test_bp1"),
new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
Assert.fail("expected to use existing entry.");
return null;
}
});
Preconditions.checkNotNull(replicaInfo2.getReplica());
Preconditions.checkState(replicaInfo2.getInvalidTokenException() == null);
Preconditions.checkState(replicaInfo1 == replicaInfo2);
pair.compareWith(replicaInfo2.getReplica().getDataStream(),
replicaInfo2.getReplica().getMetaStream());
replicaInfo1.getReplica().unref();
replicaInfo2.getReplica().unref();
// Even after the reference count falls to 0, we still keep the replica
// around for a while (we have configured the expiry period to be really,
// really long here)
ShortCircuitReplicaInfo replicaInfo3 =
cache.fetchOrCreate(
new ExtendedBlockId(123, "test_bp1"), new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
Assert.fail("expected to use existing entry.");
return null;
}
});
Preconditions.checkNotNull(replicaInfo3.getReplica());
Preconditions.checkState(replicaInfo3.getInvalidTokenException() == null);
replicaInfo3.getReplica().unref();
pair.close();
cache.close();
}
@Test(timeout=60000)
public void testExpiry() throws Exception {
final ShortCircuitCache cache =
new ShortCircuitCache(2, 1, 1, 10000000, 1, 10000);
final TestFileDescriptorPair pair = new TestFileDescriptorPair();
ShortCircuitReplicaInfo replicaInfo1 =
cache.fetchOrCreate(
new ExtendedBlockId(123, "test_bp1"), new SimpleReplicaCreator(123, cache, pair));
Preconditions.checkNotNull(replicaInfo1.getReplica());
Preconditions.checkState(replicaInfo1.getInvalidTokenException() == null);
pair.compareWith(replicaInfo1.getReplica().getDataStream(),
replicaInfo1.getReplica().getMetaStream());
replicaInfo1.getReplica().unref();
final MutableBoolean triedToCreate = new MutableBoolean(false);
do {
Thread.sleep(10);
ShortCircuitReplicaInfo replicaInfo2 =
cache.fetchOrCreate(
new ExtendedBlockId(123, "test_bp1"), new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
triedToCreate.setValue(true);
return null;
}
});
if ((replicaInfo2 != null) && (replicaInfo2.getReplica() != null)) {
replicaInfo2.getReplica().unref();
}
} while (triedToCreate.isFalse());
cache.close();
}
@Test(timeout=60000)
public void testEviction() throws Exception {
final ShortCircuitCache cache =
new ShortCircuitCache(2, 10000000, 1, 10000000, 1, 10000);
final TestFileDescriptorPair pairs[] = new TestFileDescriptorPair[] {
new TestFileDescriptorPair(),
new TestFileDescriptorPair(),
new TestFileDescriptorPair(),
};
ShortCircuitReplicaInfo replicaInfos[] = new ShortCircuitReplicaInfo[] {
null,
null,
null
};
for (int i = 0; i < pairs.length; i++) {
replicaInfos[i] = cache.fetchOrCreate(
new ExtendedBlockId(i, "test_bp1"),
new SimpleReplicaCreator(i, cache, pairs[i]));
Preconditions.checkNotNull(replicaInfos[i].getReplica());
Preconditions.checkState(replicaInfos[i].getInvalidTokenException() == null);
pairs[i].compareWith(replicaInfos[i].getReplica().getDataStream(),
replicaInfos[i].getReplica().getMetaStream());
}
// At this point, we have 3 replicas in use.
// Let's close them all.
for (int i = 0; i < pairs.length; i++) {
replicaInfos[i].getReplica().unref();
}
// The last two replicas should still be cached.
for (int i = 1; i < pairs.length; i++) {
final Integer iVal = new Integer(i);
replicaInfos[i] = cache.fetchOrCreate(
new ExtendedBlockId(i, "test_bp1"),
new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
Assert.fail("expected to use existing entry for " + iVal);
return null;
}
});
Preconditions.checkNotNull(replicaInfos[i].getReplica());
Preconditions.checkState(replicaInfos[i].getInvalidTokenException() == null);
pairs[i].compareWith(replicaInfos[i].getReplica().getDataStream(),
replicaInfos[i].getReplica().getMetaStream());
}
// The first (oldest) replica should not be cached.
final MutableBoolean calledCreate = new MutableBoolean(false);
replicaInfos[0] = cache.fetchOrCreate(
new ExtendedBlockId(0, "test_bp1"),
new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
calledCreate.setValue(true);
return null;
}
});
Preconditions.checkState(replicaInfos[0].getReplica() == null);
Assert.assertTrue(calledCreate.isTrue());
// Clean up
for (int i = 1; i < pairs.length; i++) {
replicaInfos[i].getReplica().unref();
}
for (int i = 0; i < pairs.length; i++) {
pairs[i].close();
}
cache.close();
}
@Test(timeout=60000)
public void testStaleness() throws Exception {
// Set up the cache with a short staleness time.
final ShortCircuitCache cache =
new ShortCircuitCache(2, 10000000, 1, 10000000, 1, 10);
final TestFileDescriptorPair pairs[] = new TestFileDescriptorPair[] {
new TestFileDescriptorPair(),
new TestFileDescriptorPair(),
};
ShortCircuitReplicaInfo replicaInfos[] = new ShortCircuitReplicaInfo[] {
null,
null
};
final long HOUR_IN_MS = 60 * 60 * 1000;
for (int i = 0; i < pairs.length; i++) {
final Integer iVal = new Integer(i);
final ExtendedBlockId key = new ExtendedBlockId(i, "test_bp1");
replicaInfos[i] = cache.fetchOrCreate(key,
new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
try {
return new ShortCircuitReplicaInfo(
new ShortCircuitReplica(key,
pairs[iVal].getFileInputStreams()[0],
pairs[iVal].getFileInputStreams()[1],
cache, Time.monotonicNow() + (iVal * HOUR_IN_MS)));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
Preconditions.checkNotNull(replicaInfos[i].getReplica());
Preconditions.checkState(replicaInfos[i].getInvalidTokenException() == null);
pairs[i].compareWith(replicaInfos[i].getReplica().getDataStream(),
replicaInfos[i].getReplica().getMetaStream());
}
// Keep trying to getOrCreate block 0 until it goes stale (and we must re-create.)
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
ShortCircuitReplicaInfo info = cache.fetchOrCreate(
new ExtendedBlockId(0, "test_bp1"), new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
return null;
}
});
if (info.getReplica() != null) {
info.getReplica().unref();
return false;
}
return true;
}
}, 500, 60000);
// Make sure that second replica did not go stale.
ShortCircuitReplicaInfo info = cache.fetchOrCreate(
new ExtendedBlockId(1, "test_bp1"), new ShortCircuitReplicaCreator() {
@Override
public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
Assert.fail("second replica went stale, despite 1 " +
"hour staleness time.");
return null;
}
});
info.getReplica().unref();
// Clean up
for (int i = 1; i < pairs.length; i++) {
replicaInfos[i].getReplica().unref();
}
cache.close();
}
}
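
The new TestShortCircuitCache above exercises fetchOrCreate with a ShortCircuitReplicaCreator and balances every successful fetch with an unref. A condensed sketch of that lifecycle, assuming only the constructors and methods used in this test; the block id, pool name, and already-open streams are illustrative:

package org.apache.hadoop.hdfs;

import java.io.FileInputStream;
import java.io.IOException;

import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitCache.ShortCircuitReplicaCreator;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.client.ShortCircuitReplicaInfo;
import org.apache.hadoop.util.Time;

// Illustrative sketch: fetch a replica, creating it on a cache miss, use it,
// then drop the reference so the cache is free to evict or expire it.
class ShortCircuitCacheSketch {
  static void fetchUseRelease(final ShortCircuitCache cache,
      final FileInputStream dataIn, final FileInputStream metaIn) {
    final ExtendedBlockId key = new ExtendedBlockId(123, "sketch_bp");
    ShortCircuitReplicaInfo info = cache.fetchOrCreate(key,
        new ShortCircuitReplicaCreator() {
          @Override
          public ShortCircuitReplicaInfo createShortCircuitReplicaInfo() {
            try {
              // Only invoked on a cache miss; wraps the already-open streams.
              return new ShortCircuitReplicaInfo(new ShortCircuitReplica(
                  key, dataIn, metaIn, cache, Time.monotonicNow()));
            } catch (IOException e) {
              throw new RuntimeException(e);
            }
          }
        });
    if (info.getReplica() != null) {
      // ... read through info.getReplica().getDataStream() ...
      info.getReplica().unref();  // every successful fetch must be unref'd
    }
  }
}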

View File

@ -27,6 +27,7 @@
import java.net.URI;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.UUID;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.conf.Configuration;
@ -35,8 +36,9 @@
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
@ -125,8 +127,9 @@ static void checkFileContent(URI uri, Path name, byte[] expected,
throws IOException, InterruptedException {
// Ensure short circuit is enabled
DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
ClientContext getClientContext = ClientContext.getFromConf(conf);
if (legacyShortCircuitFails) {
assertTrue(fs.getClient().useLegacyBlockReaderLocal());
assertFalse(getClientContext.getDisableLegacyBlockReaderLocal());
}
FSDataInputStream stm = fs.open(name);
@ -155,7 +158,7 @@ static void checkFileContent(URI uri, Path name, byte[] expected,
checkData(actual, readOffset, expected, "Read 3");
if (legacyShortCircuitFails) {
assertFalse(fs.getClient().useLegacyBlockReaderLocal());
assertTrue(getClientContext.getDisableLegacyBlockReaderLocal());
}
stm.close();
}
@ -175,8 +178,9 @@ static void checkFileContentDirect(URI uri, Path name, byte[] expected,
throws IOException, InterruptedException {
// Ensure short circuit is enabled
DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
ClientContext clientContext = ClientContext.getFromConf(conf);
if (legacyShortCircuitFails) {
assertTrue(fs.getClient().useLegacyBlockReaderLocal());
assertTrue(clientContext.getDisableLegacyBlockReaderLocal());
}
HdfsDataInputStream stm = (HdfsDataInputStream)fs.open(name);
@ -209,7 +213,7 @@ static void checkFileContentDirect(URI uri, Path name, byte[] expected,
}
checkData(arrayFromByteBuffer(actual), readOffset, expected, "Read 3");
if (legacyShortCircuitFails) {
assertFalse(fs.getClient().useLegacyBlockReaderLocal());
assertTrue(clientContext.getDisableLegacyBlockReaderLocal());
}
stm.close();
}
@ -223,7 +227,6 @@ public void doTestShortCircuitReadLegacy(boolean ignoreChecksum, int size,
public void doTestShortCircuitRead(boolean ignoreChecksum, int size,
int readOffset) throws IOException, InterruptedException {
String shortCircuitUser = getCurrentUser();
doTestShortCircuitReadImpl(ignoreChecksum, size, readOffset,
null, getCurrentUser(), false);
}
@ -239,6 +242,10 @@ public void doTestShortCircuitReadImpl(boolean ignoreChecksum, int size,
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
ignoreChecksum);
// Set a random client context name so that we don't share a cache with
// other invocations of this function.
conf.set(DFSConfigKeys.DFS_CLIENT_CONTEXT,
UUID.randomUUID().toString());
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir.getDir(),
"TestShortCircuitLocalRead._PORT.sock").getAbsolutePath());
@ -322,18 +329,6 @@ public void testLongFile() throws Exception {
doTestShortCircuitRead(true, 10*blockSize+100, 777);
}
private ClientDatanodeProtocol getProxy(UserGroupInformation ugi,
final DatanodeID dnInfo, final Configuration conf) throws IOException,
InterruptedException {
return ugi.doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
@Override
public ClientDatanodeProtocol run() throws Exception {
return DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf, 60000,
false);
}
});
}
private static DistributedFileSystem getFileSystem(String user, final URI uri,
final Configuration conf) throws InterruptedException, IOException {
UserGroupInformation ugi = UserGroupInformation.createRemoteUser(user);
@ -555,8 +550,7 @@ public void run() {
for (int i = 0; i < iteration; i++) {
try {
String user = getCurrentUser();
checkFileContent(fs.getUri(), file1, dataToWrite, 0, user, conf,
true);
checkFileContent(fs.getUri(), file1, dataToWrite, 0, user, conf, true);
} catch (IOException e) {
e.printStackTrace();
} catch (InterruptedException e) {
@ -608,7 +602,8 @@ public void doTestShortCircuitReadWithRemoteBlockReader(boolean ignoreChecksum,
stm.write(fileData);
stm.close();
try {
checkFileContent(uri, file1, fileData, readOffset, shortCircuitUser, conf, shortCircuitFails);
checkFileContent(uri, file1, fileData, readOffset, shortCircuitUser,
conf, shortCircuitFails);
//RemoteBlockReader have unsupported method read(ByteBuffer bf)
assertTrue("RemoteBlockReader unsupported method read(ByteBuffer bf) error",
checkUnsupportedMethod(fs, file1, fileData, readOffset));

View File

@ -38,10 +38,16 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.DFSClient.Conf;
import org.apache.hadoop.hdfs.RemotePeerFactory;
import org.apache.hadoop.hdfs.client.ShortCircuitCache;
import org.apache.hadoop.hdfs.client.ShortCircuitReplica;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@ -55,10 +61,13 @@
import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Level;
import org.junit.Assert;
import org.junit.Test;
public class TestBlockTokenWithDFS {
@ -131,50 +140,70 @@ private static FSDataOutputStream writeFile(FileSystem fileSys, Path name,
}
// try reading a block using a BlockReader directly
private static void tryRead(Configuration conf, LocatedBlock lblock,
private static void tryRead(final Configuration conf, LocatedBlock lblock,
boolean shouldSucceed) {
InetSocketAddress targetAddr = null;
Socket s = null;
IOException ioe = null;
BlockReader blockReader = null;
ExtendedBlock block = lblock.getBlock();
try {
DatanodeInfo[] nodes = lblock.getLocations();
targetAddr = NetUtils.createSocketAddr(nodes[0].getXferAddr());
s = NetUtils.getDefaultSocketFactory(conf).createSocket();
s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
String file = BlockReaderFactory.getFileName(targetAddr,
"test-blockpoolid", block.getBlockId());
blockReader = BlockReaderFactory.newBlockReader(
new DFSClient.Conf(conf), file, block, lblock.getBlockToken(), 0, -1,
true, "TestBlockTokenWithDFS", TcpPeerServer.peerFromSocket(s),
nodes[0], null, null, null, false,
CachingStrategy.newDefaultStrategy());
blockReader = new BlockReaderFactory(new DFSClient.Conf(conf)).
setFileName(BlockReaderFactory.getFileName(targetAddr,
"test-blockpoolid", block.getBlockId())).
setBlock(block).
setBlockToken(lblock.getBlockToken()).
setInetSocketAddress(targetAddr).
setStartOffset(0).
setLength(-1).
setVerifyChecksum(true).
setClientName("TestBlockTokenWithDFS").
setDatanodeInfo(nodes[0]).
setCachingStrategy(CachingStrategy.newDefaultStrategy()).
setClientCacheContext(ClientContext.getFromConf(conf)).
setConfiguration(conf).
setRemotePeerFactory(new RemotePeerFactory() {
@Override
public Peer newConnectedPeer(InetSocketAddress addr)
throws IOException {
Peer peer = null;
Socket sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
try {
sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
peer = TcpPeerServer.peerFromSocket(sock);
} finally {
if (peer == null) {
IOUtils.closeSocket(sock);
}
}
return peer;
}
}).
build();
} catch (IOException ex) {
if (ex instanceof InvalidBlockTokenException) {
assertFalse("OP_READ_BLOCK: access token is invalid, "
+ "when it is expected to be valid", shouldSucceed);
return;
}
fail("OP_READ_BLOCK failed due to reasons other than access token: "
+ StringUtils.stringifyException(ex));
ioe = ex;
} finally {
if (s != null) {
if (blockReader != null) {
try {
s.close();
} catch (IOException iex) {
} finally {
s = null;
blockReader.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
if (blockReader == null) {
fail("OP_READ_BLOCK failed due to reasons other than access token");
if (shouldSucceed) {
Assert.assertNotNull("OP_READ_BLOCK: access token is invalid, "
+ "when it is expected to be valid", blockReader);
} else {
Assert.assertNotNull("OP_READ_BLOCK: access token is valid, "
+ "when it is expected to be invalid", ioe);
Assert.assertTrue(
"OP_READ_BLOCK failed due to reasons other than access token: ",
ioe instanceof InvalidBlockTokenException);
}
assertTrue("OP_READ_BLOCK: access token is valid, "
+ "when it is expected to be invalid", shouldSucceed);
}
// get a conf for testing
@ -347,9 +376,13 @@ public void testRead() throws Exception {
/*
* testing READ interface on DN using a BlockReader
*/
new DFSClient(new InetSocketAddress("localhost",
DFSClient client = null;
try {
client = new DFSClient(new InetSocketAddress("localhost",
cluster.getNameNodePort()), conf);
} finally {
if (client != null) client.close();
}
List<LocatedBlock> locatedBlocks = nnProto.getBlockLocations(
FILE_TO_READ, 0, FILE_SIZE).getLocatedBlocks();
LocatedBlock lblock = locatedBlocks.get(0); // first block

View File

@ -35,11 +35,14 @@
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.ClientContext;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.RemotePeerFactory;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
@ -48,13 +51,14 @@
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
import org.apache.hadoop.hdfs.server.protocol.StorageBlockReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
@ -284,23 +288,43 @@ private boolean deteteBlocks(File dir) {
private void accessBlock(DatanodeInfo datanode, LocatedBlock lblock)
throws IOException {
InetSocketAddress targetAddr = null;
Socket s = null;
ExtendedBlock block = lblock.getBlock();
targetAddr = NetUtils.createSocketAddr(datanode.getXferAddr());
s = NetUtils.getDefaultSocketFactory(conf).createSocket();
s.connect(targetAddr, HdfsServerConstants.READ_TIMEOUT);
s.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
String file = BlockReaderFactory.getFileName(targetAddr,
"test-blockpoolid",
block.getBlockId());
BlockReader blockReader =
BlockReaderFactory.newBlockReader(new DFSClient.Conf(conf), file, block,
lblock.getBlockToken(), 0, -1, true, "TestDataNodeVolumeFailure",
TcpPeerServer.peerFromSocket(s), datanode, null, null, null, false,
CachingStrategy.newDefaultStrategy());
BlockReader blockReader = new BlockReaderFactory(new DFSClient.Conf(conf)).
setInetSocketAddress(targetAddr).
setBlock(block).
setFileName(BlockReaderFactory.getFileName(targetAddr,
"test-blockpoolid", block.getBlockId())).
setBlockToken(lblock.getBlockToken()).
setStartOffset(0).
setLength(-1).
setVerifyChecksum(true).
setClientName("TestDataNodeVolumeFailure").
setDatanodeInfo(datanode).
setCachingStrategy(CachingStrategy.newDefaultStrategy()).
setClientCacheContext(ClientContext.getFromConf(conf)).
setConfiguration(conf).
setRemotePeerFactory(new RemotePeerFactory() {
@Override
public Peer newConnectedPeer(InetSocketAddress addr)
throws IOException {
Peer peer = null;
Socket sock = NetUtils.getDefaultSocketFactory(conf).createSocket();
try {
sock.connect(addr, HdfsServerConstants.READ_TIMEOUT);
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
peer = TcpPeerServer.peerFromSocket(sock);
} finally {
if (peer == null) {
IOUtils.closeSocket(sock);
}
}
return peer;
}
}).
build();
blockReader.close();
}

View File

@ -92,7 +92,7 @@ public void testDataLocality() throws Exception {
//The chosen datanode must be the same as the client address
final DatanodeInfo chosen = NamenodeWebHdfsMethods.chooseDatanode(
namenode, f, PutOpParam.Op.CREATE, -1L, blocksize, conf);
namenode, f, PutOpParam.Op.CREATE, -1L, blocksize);
Assert.assertEquals(ipAddr, chosen.getIpAddr());
}
}
@ -117,19 +117,19 @@ public void testDataLocality() throws Exception {
{ //test GETFILECHECKSUM
final DatanodeInfo chosen = NamenodeWebHdfsMethods.chooseDatanode(
namenode, f, GetOpParam.Op.GETFILECHECKSUM, -1L, blocksize, conf);
namenode, f, GetOpParam.Op.GETFILECHECKSUM, -1L, blocksize);
Assert.assertEquals(expected, chosen);
}
{ //test OPEN
final DatanodeInfo chosen = NamenodeWebHdfsMethods.chooseDatanode(
namenode, f, GetOpParam.Op.OPEN, 0, blocksize, conf);
namenode, f, GetOpParam.Op.OPEN, 0, blocksize);
Assert.assertEquals(expected, chosen);
}
{ //test APPEND
final DatanodeInfo chosen = NamenodeWebHdfsMethods.chooseDatanode(
namenode, f, PostOpParam.Op.APPEND, -1L, blocksize, conf);
namenode, f, PostOpParam.Op.APPEND, -1L, blocksize);
Assert.assertEquals(expected, chosen);
}
} finally {

View File

@ -136,6 +136,7 @@ public void testFileNameEncoding() throws IOException, URISyntaxException {
out.close();
FSDataInputStream in = hftpFs.open(p);
assertEquals('0', in.read());
in.close();
// Check the file status matches the path. Hftp returns a FileStatus
// with the entire URI, extract the path part.
@ -250,6 +251,7 @@ public void testSeek() throws IOException {
FSDataInputStream in = hftpFs.open(testFile);
in.seek(7);
assertEquals('7', in.read());
in.close();
}
@Test

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.web;
import java.io.File;
import java.io.InputStream;
import java.net.InetSocketAddress;
import java.net.URI;
@ -92,6 +93,9 @@ public void testSWebHdfsFileSystem() throws Exception {
os.write(23);
os.close();
Assert.assertTrue(fs.exists(f));
InputStream is = fs.open(f);
Assert.assertEquals(23, is.read());
is.close();
fs.close();
}
}

View File

@ -158,6 +158,9 @@ Release 2.4.0 - UNRELEASED
OPTIMIZATIONS
BUG FIXES
MAPREDUCE-5746. Job diagnostics can implicate wrong task for a failed job.
(Jason Lowe via kasha)
Release 2.3.1 - UNRELEASED

View File

@ -353,8 +353,10 @@ private void handleTaskFailedEvent(TaskFailedEvent event) {
taskInfo.error = StringInterner.weakIntern(event.getError());
taskInfo.failedDueToAttemptId = event.getFailedAttemptID();
taskInfo.counters = event.getCounters();
info.errorInfo = "Task " + taskInfo.taskId +" failed " +
taskInfo.attemptsMap.size() + " times ";
if (info.errorInfo.isEmpty()) {
info.errorInfo = "Task " + taskInfo.taskId + " failed " +
taskInfo.attemptsMap.size() + " times ";
}
}
private void handleTaskStartedEvent(TaskStartedEvent event) {

View File

@ -40,6 +40,8 @@
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.hadoop.mapreduce.TypeConverter;
@ -51,7 +53,9 @@
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.jobhistory.TaskFailedEvent;
import org.apache.hadoop.mapreduce.jobhistory.TaskFinishedEvent;
import org.apache.hadoop.mapreduce.jobhistory.TaskStartedEvent;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.JobState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
@ -69,7 +73,6 @@
import org.apache.hadoop.mapreduce.v2.hs.HistoryFileManager.HistoryFileInfo;
import org.apache.hadoop.mapreduce.v2.hs.TestJobHistoryEvents.MRAppWithHistory;
import org.apache.hadoop.mapreduce.v2.hs.webapp.dao.JobsInfo;
import org.apache.hadoop.mapreduce.v2.jobhistory.FileNameIndexUtils;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobIndexInfo;
import org.apache.hadoop.net.DNSToSwitchMapping;
@ -730,4 +733,40 @@ public void testPartialJob() throws Exception {
assertNull(test.getAMInfos());
}
@Test
public void testMultipleFailedTasks() throws Exception {
JobHistoryParser parser =
new JobHistoryParser(Mockito.mock(FSDataInputStream.class));
EventReader reader = Mockito.mock(EventReader.class);
final AtomicInteger numEventsRead = new AtomicInteger(0); // Hack!
final org.apache.hadoop.mapreduce.TaskType taskType =
org.apache.hadoop.mapreduce.TaskType.MAP;
final TaskID[] tids = new TaskID[2];
JobID jid = new JobID("1", 1);
tids[0] = new TaskID(jid, taskType, 0);
tids[1] = new TaskID(jid, taskType, 1);
Mockito.when(reader.getNextEvent()).thenAnswer(
new Answer<HistoryEvent>() {
public HistoryEvent answer(InvocationOnMock invocation)
throws IOException {
// send two task start and two task fail events for tasks 0 and 1
int eventId = numEventsRead.getAndIncrement();
TaskID tid = tids[eventId & 0x1];
if (eventId < 2) {
return new TaskStartedEvent(tid, 0, taskType, "");
}
if (eventId < 4) {
TaskFailedEvent tfe = new TaskFailedEvent(tid, 0, taskType,
"failed", "FAILED", null, new Counters());
tfe.setDatum(tfe.getDatum());
return tfe;
}
return null;
}
});
JobInfo info = parser.parse(reader);
assertTrue("Task 0 not implicated",
info.getErrorInfo().contains(tids[0].toString()));
}
}

View File

@ -6,20 +6,6 @@ Trunk - Unreleased
NEW FEATURES
YARN-1496. Protocol additions to allow moving apps between queues (Sandy
Ryza)
YARN-1498. Common scheduler changes for moving apps between queues (Sandy
Ryza)
YARN-1504. RM changes for moving apps between queues (Sandy Ryza)
YARN-1499. Fair Scheduler changes for moving apps between queues (Sandy
Ryza)
YARN-1497. Command line additions for moving apps between queues (Sandy
Ryza)
IMPROVEMENTS
OPTIMIZATIONS
@ -121,6 +107,20 @@ Release 2.4.0 - UNRELEASED
YARN-1637. Implemented a client library for Java users to post timeline
entities and events. (zjshen)
YARN-1496. Protocol additions to allow moving apps between queues (Sandy
Ryza)
YARN-1498. Common scheduler changes for moving apps between queues (Sandy
Ryza)
YARN-1504. RM changes for moving apps between queues (Sandy Ryza)
YARN-1499. Fair Scheduler changes for moving apps between queues (Sandy
Ryza)
YARN-1497. Command line additions for moving apps between queues (Sandy
Ryza)
IMPROVEMENTS
@ -176,6 +176,14 @@ Release 2.4.0 - UNRELEASED
YARN-1706. Created an utility method to dump timeline records to JSON
strings. (zjshen)
YARN-1641. ZK store should attempt a write periodically to ensure it is
still Active. (kasha)
YARN-1531. True up yarn command documentation (Akira Ajisaka via kasha)
YARN-1345. Remove FINAL_SAVING state from YarnApplicationAttemptState
(Zhijie Shen via jianhe)
OPTIMIZATIONS
BUG FIXES
@ -246,6 +254,17 @@ Release 2.4.0 - UNRELEASED
YARN-1698. Fixed default TimelineStore in code to match what is documented
in yarn-default.xml (Zhijie Shen via vinodkv)
YARN-1697. NodeManager reports negative running containers (Sandy Ryza)
YARN-1719. Fixed the root path related Jersey warnings produced in
ATSWebServices. (Billie Rinaldi via zjshen)
YARN-1692. ConcurrentModificationException in fair scheduler AppSchedulable
(Sangjin Lee via Sandy Ryza)
YARN-1578. Fixed reading incomplete application attempt and container data
in FileSystemApplicationHistoryStore. (Shinichi Yamashita via zjshen)
Release 2.3.1 - UNRELEASED
INCOMPATIBLE CHANGES
@ -256,7 +275,7 @@ Release 2.3.1 - UNRELEASED
OPTIMIZATIONS
BUG FIXES
BUG FIXES
Release 2.3.0 - 2014-02-18

View File

@ -51,9 +51,6 @@ public enum YarnApplicationAttemptState {
/** AppAttempt is currently running. */
RUNNING,
/** AppAttempt is waiting for state being saved */
FINAL_SAVING,
/** AppAttempt is finishing. */
FINISHING,

View File

@ -120,10 +120,9 @@ enum YarnApplicationAttemptStateProto {
APP_ATTEMPT_LAUNCHED = 6;
APP_ATTEMPT_FAILED = 7;
APP_ATTEMPT_RUNNING = 8;
APP_ATTEMPT_FINAL_SAVING = 9;
APP_ATTEMPT_FINISHING = 10;
APP_ATTEMPT_FINISHED = 11;
APP_ATTEMPT_KILLED = 12;
APP_ATTEMPT_FINISHING = 9;
APP_ATTEMPT_FINISHED = 10;
APP_ATTEMPT_KILLED = 11;
}
enum FinalApplicationStatusProto {

View File

@ -215,17 +215,30 @@ public Map<ApplicationId, ApplicationHistoryData> getAllApplications()
getApplicationAttempts(ApplicationId appId) throws IOException {
Map<ApplicationAttemptId, ApplicationAttemptHistoryData> historyDataMap =
new HashMap<ApplicationAttemptId, ApplicationAttemptHistoryData>();
Map<ApplicationAttemptId, StartFinishDataPair<ApplicationAttemptStartData, ApplicationAttemptFinishData>> startFinshDataMap =
new HashMap<ApplicationAttemptId, StartFinishDataPair<ApplicationAttemptStartData, ApplicationAttemptFinishData>>();
HistoryFileReader hfReader = getHistoryFileReader(appId);
try {
while (hfReader.hasNext()) {
HistoryFileReader.Entry entry = hfReader.next();
if (entry.key.id.startsWith(ConverterUtils.APPLICATION_ATTEMPT_PREFIX)) {
if (entry.key.suffix.equals(START_DATA_SUFFIX)) {
retrieveStartFinishData(appId, entry, startFinshDataMap, true);
} else if (entry.key.suffix.equals(FINISH_DATA_SUFFIX)) {
retrieveStartFinishData(appId, entry, startFinshDataMap, false);
if (entry.key.id.startsWith(
ConverterUtils.APPLICATION_ATTEMPT_PREFIX)) {
ApplicationAttemptId appAttemptId =
ConverterUtils.toApplicationAttemptId(entry.key.id);
if (appAttemptId.getApplicationId().equals(appId)) {
ApplicationAttemptHistoryData historyData =
historyDataMap.get(appAttemptId);
if (historyData == null) {
historyData = ApplicationAttemptHistoryData.newInstance(
appAttemptId, null, -1, null, null, null,
FinalApplicationStatus.UNDEFINED, null);
historyDataMap.put(appAttemptId, historyData);
}
if (entry.key.suffix.equals(START_DATA_SUFFIX)) {
mergeApplicationAttemptHistoryData(historyData,
parseApplicationAttemptStartData(entry.value));
} else if (entry.key.suffix.equals(FINISH_DATA_SUFFIX)) {
mergeApplicationAttemptHistoryData(historyData,
parseApplicationAttemptFinishData(entry.value));
}
}
}
}
@ -237,45 +250,9 @@ public Map<ApplicationId, ApplicationHistoryData> getAllApplications()
} finally {
hfReader.close();
}
for (Map.Entry<ApplicationAttemptId, StartFinishDataPair<ApplicationAttemptStartData, ApplicationAttemptFinishData>> entry : startFinshDataMap
.entrySet()) {
ApplicationAttemptHistoryData historyData =
ApplicationAttemptHistoryData.newInstance(entry.getKey(), null, -1,
null, null, null, FinalApplicationStatus.UNDEFINED, null);
mergeApplicationAttemptHistoryData(historyData,
entry.getValue().startData);
mergeApplicationAttemptHistoryData(historyData,
entry.getValue().finishData);
historyDataMap.put(entry.getKey(), historyData);
}
return historyDataMap;
}
private
void
retrieveStartFinishData(
ApplicationId appId,
HistoryFileReader.Entry entry,
Map<ApplicationAttemptId, StartFinishDataPair<ApplicationAttemptStartData, ApplicationAttemptFinishData>> startFinshDataMap,
boolean start) throws IOException {
ApplicationAttemptId appAttemptId =
ConverterUtils.toApplicationAttemptId(entry.key.id);
if (appAttemptId.getApplicationId().equals(appId)) {
StartFinishDataPair<ApplicationAttemptStartData, ApplicationAttemptFinishData> pair =
startFinshDataMap.get(appAttemptId);
if (pair == null) {
pair =
new StartFinishDataPair<ApplicationAttemptStartData, ApplicationAttemptFinishData>();
startFinshDataMap.put(appAttemptId, pair);
}
if (start) {
pair.startData = parseApplicationAttemptStartData(entry.value);
} else {
pair.finishData = parseApplicationAttemptFinishData(entry.value);
}
}
}
@Override
public ApplicationAttemptHistoryData getApplicationAttempt(
ApplicationAttemptId appAttemptId) throws IOException {
@ -391,20 +368,30 @@ public Map<ContainerId, ContainerHistoryData> getContainers(
ApplicationAttemptId appAttemptId) throws IOException {
Map<ContainerId, ContainerHistoryData> historyDataMap =
new HashMap<ContainerId, ContainerHistoryData>();
Map<ContainerId, StartFinishDataPair<ContainerStartData, ContainerFinishData>> startFinshDataMap =
new HashMap<ContainerId, StartFinishDataPair<ContainerStartData, ContainerFinishData>>();
HistoryFileReader hfReader =
getHistoryFileReader(appAttemptId.getApplicationId());
try {
while (hfReader.hasNext()) {
HistoryFileReader.Entry entry = hfReader.next();
if (entry.key.id.startsWith(ConverterUtils.CONTAINER_PREFIX)) {
if (entry.key.suffix.equals(START_DATA_SUFFIX)) {
retrieveStartFinishData(appAttemptId, entry, startFinshDataMap,
true);
} else if (entry.key.suffix.equals(FINISH_DATA_SUFFIX)) {
retrieveStartFinishData(appAttemptId, entry, startFinshDataMap,
false);
ContainerId containerId =
ConverterUtils.toContainerId(entry.key.id);
if (containerId.getApplicationAttemptId().equals(appAttemptId)) {
ContainerHistoryData historyData =
historyDataMap.get(containerId);
if (historyData == null) {
historyData = ContainerHistoryData.newInstance(
containerId, null, null, null, Long.MIN_VALUE,
Long.MAX_VALUE, null, null, Integer.MAX_VALUE, null);
historyDataMap.put(containerId, historyData);
}
if (entry.key.suffix.equals(START_DATA_SUFFIX)) {
mergeContainerHistoryData(historyData,
parseContainerStartData(entry.value));
} else if (entry.key.suffix.equals(FINISH_DATA_SUFFIX)) {
mergeContainerHistoryData(historyData,
parseContainerFinishData(entry.value));
}
}
}
}
@ -416,43 +403,9 @@ public Map<ContainerId, ContainerHistoryData> getContainers(
} finally {
hfReader.close();
}
for (Map.Entry<ContainerId, StartFinishDataPair<ContainerStartData, ContainerFinishData>> entry : startFinshDataMap
.entrySet()) {
ContainerHistoryData historyData =
ContainerHistoryData
.newInstance(entry.getKey(), null, null, null, Long.MIN_VALUE,
Long.MAX_VALUE, null, null, Integer.MAX_VALUE, null);
mergeContainerHistoryData(historyData, entry.getValue().startData);
mergeContainerHistoryData(historyData, entry.getValue().finishData);
historyDataMap.put(entry.getKey(), historyData);
}
return historyDataMap;
}
private
void
retrieveStartFinishData(
ApplicationAttemptId appAttemptId,
HistoryFileReader.Entry entry,
Map<ContainerId, StartFinishDataPair<ContainerStartData, ContainerFinishData>> startFinshDataMap,
boolean start) throws IOException {
ContainerId containerId = ConverterUtils.toContainerId(entry.key.id);
if (containerId.getApplicationAttemptId().equals(appAttemptId)) {
StartFinishDataPair<ContainerStartData, ContainerFinishData> pair =
startFinshDataMap.get(containerId);
if (pair == null) {
pair =
new StartFinishDataPair<ContainerStartData, ContainerFinishData>();
startFinshDataMap.put(containerId, pair);
}
if (start) {
pair.startData = parseContainerStartData(entry.value);
} else {
pair.finishData = parseContainerFinishData(entry.value);
}
}
}
@Override
public void applicationStarted(ApplicationStartData appStart)
throws IOException {
@ -828,14 +781,5 @@ public void readFields(DataInput in) throws IOException {
id = in.readUTF();
suffix = in.readUTF();
}
}
private static class StartFinishDataPair<S, F> {
private S startData;
private F finishData;
}
}

View File

@ -107,7 +107,6 @@ public void setAbout(String about) {
* Return the description of the application timeline web services.
*/
@GET
@Path("/")
@Produces({ MediaType.APPLICATION_JSON /* , MediaType.APPLICATION_XML */})
public AboutInfo about(
@Context HttpServletRequest req,
@ -235,7 +234,6 @@ public ATSEvents getEvents(
* that happen during storing.
*/
@POST
@Path("/")
@Consumes({ MediaType.APPLICATION_JSON /* , MediaType.APPLICATION_XML */})
public ATSPutErrors postEntities(
@Context HttpServletRequest req,

View File

@ -72,6 +72,12 @@ public void testReadWriteHistoryData() throws IOException {
}
private void testWriteHistoryData(int num) throws IOException {
testWriteHistoryData(num, false, false);
}
private void testWriteHistoryData(
int num, boolean missingContainer, boolean missingApplicationAttempt)
throws IOException {
// write application history data
for (int i = 1; i <= num; ++i) {
ApplicationId appId = ApplicationId.newInstance(0, i);
@ -83,21 +89,31 @@ private void testWriteHistoryData(int num) throws IOException {
ApplicationAttemptId.newInstance(appId, j);
writeApplicationAttemptStartData(appAttemptId);
if (missingApplicationAttempt && j == num) {
continue;
}
// write container history data
for (int k = 1; k <= num; ++k) {
ContainerId containerId = ContainerId.newInstance(appAttemptId, k);
writeContainerStartData(containerId);
if (missingContainer && k == num) {
continue;
}
writeContainerFinishData(containerId);
writeApplicationAttemptFinishData(appAttemptId);
}
writeApplicationAttemptFinishData(appAttemptId);
}
writeApplicationFinishData(appId);
}
}
private void testReadHistoryData(int num) throws IOException {
testReadHistoryData(num, false, false);
}
private void testReadHistoryData(
int num, boolean missingContainer, boolean missingApplicationAttempt)
throws IOException {
// read application history data
Assert.assertEquals(num, store.getAllApplications().size());
for (int i = 1; i <= num; ++i) {
@ -116,8 +132,14 @@ private void testReadHistoryData(int num) throws IOException {
store.getApplicationAttempt(appAttemptId);
Assert.assertNotNull(attemptData);
Assert.assertEquals(appAttemptId.toString(), attemptData.getHost());
Assert.assertEquals(appAttemptId.toString(),
attemptData.getDiagnosticsInfo());
if (missingApplicationAttempt && j == num) {
Assert.assertNull(attemptData.getDiagnosticsInfo());
continue;
} else {
Assert.assertEquals(appAttemptId.toString(),
attemptData.getDiagnosticsInfo());
}
// read container history data
Assert.assertEquals(num, store.getContainers(appAttemptId).size());
@ -127,8 +149,12 @@ private void testReadHistoryData(int num) throws IOException {
Assert.assertNotNull(containerData);
Assert.assertEquals(Priority.newInstance(containerId.getId()),
containerData.getPriority());
Assert.assertEquals(containerId.toString(),
containerData.getDiagnosticsInfo());
if (missingContainer && k == num) {
Assert.assertNull(containerData.getDiagnosticsInfo());
} else {
Assert.assertEquals(containerId.toString(),
containerData.getDiagnosticsInfo());
}
}
ContainerHistoryData masterContainer =
store.getAMContainer(appAttemptId);
@ -193,4 +219,15 @@ public void testMassiveWriteContainerHistoryData() throws IOException {
Assert.assertTrue((usedDiskAfter - usedDiskBefore) < 20);
}
@Test
public void testMissingContainerHistoryData() throws IOException {
testWriteHistoryData(3, true, false);
testReadHistoryData(3, true, false);
}
@Test
public void testMissingApplicationAttemptHistoryData() throws IOException {
testWriteHistoryData(3, false, true);
testReadHistoryData(3, false, true);
}
}

View File

@ -83,6 +83,7 @@ public class ContainerImpl implements Container {
private final String user;
private int exitCode = ContainerExitStatus.INVALID;
private final StringBuilder diagnostics;
private boolean wasLaunched;
/** The NM-wide configuration - not specific to this container */
private final Configuration daemonConf;
@ -418,7 +419,9 @@ private void finished() {
applicationId, containerId);
break;
case EXITED_WITH_FAILURE:
metrics.endRunningContainer();
if (wasLaunched) {
metrics.endRunningContainer();
}
// fall through
case LOCALIZATION_FAILED:
metrics.failedContainer();
@ -428,7 +431,9 @@ private void finished() {
applicationId, containerId);
break;
case CONTAINER_CLEANEDUP_AFTER_KILL:
metrics.endRunningContainer();
if (wasLaunched) {
metrics.endRunningContainer();
}
// fall through
case NEW:
metrics.killedContainer();
@ -636,6 +641,7 @@ public void transition(ContainerImpl container, ContainerEvent event) {
new ContainerStartMonitoringEvent(container.containerId,
vmemBytes, pmemBytes));
container.metrics.runningContainer();
container.wasLaunched = true;
}
}

View File

@ -99,4 +99,8 @@ public void releaseContainer(Resource res) {
public void addResource(Resource res) {
availableGB.incr(res.getMemory() / 1024);
}
public int getRunningContainers() {
return containersRunning.value();
}
}

View File

@ -348,6 +348,9 @@ public void testKillOnLocalizedWhenContainerNotLaunched() throws Exception {
wc.c.getContainerState());
assertNull(wc.c.getLocalizedResources());
verifyCleanupCall(wc);
wc.c.handle(new ContainerEvent(wc.c.getContainerId(),
ContainerEventType.CONTAINER_RESOURCES_CLEANEDUP));
assertEquals(0, metrics.getRunningContainers());
} finally {
if (wc != null) {
wc.finished();

View File

@ -214,8 +214,6 @@ public static YarnApplicationAttemptState createApplicationAttemptState(
return YarnApplicationAttemptState.RUNNING;
case FINISHING:
return YarnApplicationAttemptState.FINISHING;
case FINAL_SAVING:
return YarnApplicationAttemptState.FINAL_SAVING;
case FINISHED:
return YarnApplicationAttemptState.FINISHED;
case KILLED:

View File

@ -676,11 +676,11 @@ protected void handleStoreEvent(RMStateStoreEvent event) {
@SuppressWarnings("unchecked")
/**
* In {#handleStoreEvent}, this method is called to notify the
* ResourceManager that the store operation has failed.
* This method is called to notify the ResourceManager that the store
* operation has failed.
* @param failureCause the exception due to which the operation failed
*/
private void notifyStoreOperationFailed(Exception failureCause) {
protected void notifyStoreOperationFailed(Exception failureCause) {
RMFatalEventType type;
if (failureCause instanceof StoreFencedException) {
type = RMFatalEventType.STATE_STORE_FENCED;

View File

@ -137,6 +137,7 @@ public class ZKRMStateStore extends RMStateStore {
private String fencingNodePath;
private Op createFencingNodePathOp;
private Op deleteFencingNodePathOp;
private Thread verifyActiveStatusThread;
private String zkRootNodeUsername;
private final String zkRootNodePassword = Long.toString(random.nextLong());
@ -258,6 +259,8 @@ public synchronized void startInternal() throws Exception {
createRootDir(zkRootNodePath);
if (HAUtil.isHAEnabled(getConfig())){
fence();
verifyActiveStatusThread = new VerifyActiveStatusThread();
verifyActiveStatusThread.start();
}
createRootDir(rmAppRoot);
createRootDir(rmDTSecretManagerRoot);
@ -350,6 +353,10 @@ private synchronized void closeZkClients() throws IOException {
@Override
protected synchronized void closeInternal() throws Exception {
if (verifyActiveStatusThread != null) {
verifyActiveStatusThread.interrupt();
verifyActiveStatusThread.join(1000);
}
closeZkClients();
}
@ -856,6 +863,32 @@ List<String> run() throws KeeperException, InterruptedException {
}.runWithRetries();
}
/**
* Helper class that periodically attempts creating a znode to ensure that
* this RM continues to be the Active.
*/
private class VerifyActiveStatusThread extends Thread {
private List<Op> emptyOpList = new ArrayList<Op>();
VerifyActiveStatusThread() {
super(VerifyActiveStatusThread.class.getName());
}
public void run() {
try {
while (true) {
doMultiWithRetries(emptyOpList);
Thread.sleep(zkSessionTimeout);
}
} catch (InterruptedException ie) {
LOG.info(VerifyActiveStatusThread.class.getName() + " thread " +
"interrupted! Exiting!");
} catch (Exception e) {
notifyStoreOperationFailed(new StoreFencedException());
}
}
}
private abstract class ZKAction<T> {
// run() expects synchronization on ZKRMStateStore.this
abstract T run() throws KeeperException, InterruptedException;

View File

@ -157,6 +157,7 @@ public class RMAppAttemptImpl implements RMAppAttempt, Recoverable {
private RMAppAttemptEvent eventCausingFinalSaving;
private RMAppAttemptState targetedFinalState;
private RMAppAttemptState recoveredFinalState;
private RMAppAttemptState stateBeforeFinalSaving;
private Object transitionTodo;
private static final StateMachineFactory<RMAppAttemptImpl,
@ -885,6 +886,7 @@ private void rememberTargetTransitionsAndStoreState(RMAppAttemptEvent event,
RMAppAttemptState stateToBeStored) {
rememberTargetTransitions(event, transitionToDo, targetFinalState);
stateBeforeFinalSaving = getState();
// As of today, finalState, diagnostics, final-tracking-url and
// finalAppStatus are the only things that we store into the StateStore
@ -1536,6 +1538,10 @@ public RMAppAttemptState getState() {
@Override
public YarnApplicationAttemptState createApplicationAttemptState() {
RMAppAttemptState state = getState();
// If AppAttempt is in FINAL_SAVING state, return its previous state.
if (state.equals(RMAppAttemptState.FINAL_SAVING)) {
state = stateBeforeFinalSaving;
}
return RMServerUtils.createApplicationAttemptState(state);
}

View File

@ -150,7 +150,7 @@ public Collection<Priority> getPriorities() {
return appSchedulingInfo.getPriorities();
}
public ResourceRequest getResourceRequest(Priority priority, String resourceName) {
public synchronized ResourceRequest getResourceRequest(Priority priority, String resourceName) {
return this.appSchedulingInfo.getResourceRequest(priority, resourceName);
}
@ -158,7 +158,7 @@ public synchronized int getTotalRequiredResources(Priority priority) {
return getResourceRequest(priority, ResourceRequest.ANY).getNumContainers();
}
public Resource getResource(Priority priority) {
public synchronized Resource getResource(Priority priority) {
return appSchedulingInfo.getResource(priority);
}

View File

@ -192,7 +192,7 @@ public synchronized float getLocalityWaitFactor(
return Math.min(((float)requiredResources / clusterNodes), 1.0f);
}
public Resource getTotalPendingRequests() {
public synchronized Resource getTotalPendingRequests() {
Resource ret = Resource.newInstance(0, 0);
for (ResourceRequest rr : appSchedulingInfo.getAllResourceRequests()) {
// to avoid double counting we count only "ANY" resource requests

View File

@ -82,10 +82,12 @@ public void updateDemand() {
Resources.addTo(demand, app.getCurrentConsumption());
// Add up outstanding resource requests
for (Priority p : app.getPriorities()) {
for (ResourceRequest r : app.getResourceRequests(p).values()) {
Resource total = Resources.multiply(r.getCapability(), r.getNumContainers());
Resources.addTo(demand, total);
synchronized (app) {
for (Priority p : app.getPriorities()) {
for (ResourceRequest r : app.getResourceRequests(p).values()) {
Resource total = Resources.multiply(r.getCapability(), r.getNumContainers());
Resources.addTo(demand, total);
}
}
}
}

View File

@ -23,10 +23,7 @@
import static org.mockito.Mockito.mock;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -34,15 +31,8 @@
import org.apache.hadoop.ha.HAServiceProtocol;
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.service.Service;
import org.apache.hadoop.yarn.api.protocolrecords.SubmitApplicationRequest;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.conf.HAUtil;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.server.resourcemanager.ClientRMService;
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.RMStateVersion;
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.impl.pb.RMStateVersionPBImpl;
@ -54,6 +44,7 @@
public class TestZKRMStateStore extends RMStateStoreTestBase {
public static final Log LOG = LogFactory.getLog(TestZKRMStateStore.class);
private static final int ZK_TIMEOUT_MS = 1000;
class TestZKRMStateStoreTester implements RMStateStoreHelper {
@ -141,6 +132,7 @@ private Configuration createHARMConf(
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.set(YarnConfiguration.RM_STORE, ZKRMStateStore.class.getName());
conf.set(YarnConfiguration.RM_ZK_ADDRESS, hostPort);
conf.setInt(YarnConfiguration.RM_ZK_TIMEOUT_MS, ZK_TIMEOUT_MS);
conf.set(YarnConfiguration.RM_HA_ID, rmId);
for (String rpcAddress : YarnConfiguration.RM_SERVICES_ADDRESS_CONF_KEYS) {
for (String id : HAUtil.getRMHAIds(conf)) {
@ -182,26 +174,7 @@ public void testFencing() throws Exception {
HAServiceProtocol.HAServiceState.ACTIVE,
rm2.getRMContext().getRMAdminService().getServiceStatus().getState());
// Submitting an application to RM1 to trigger a state store operation.
// RM1 should realize that it got fenced and is not the Active RM anymore.
Map mockMap = mock(Map.class);
ApplicationSubmissionContext asc =
ApplicationSubmissionContext.newInstance(
ApplicationId.newInstance(1000, 1),
"testApplication", // app Name
"default", // queue name
Priority.newInstance(0),
ContainerLaunchContext.newInstance(mockMap, mockMap,
new ArrayList<String>(), mockMap, mock(ByteBuffer.class),
mockMap),
false, // unmanaged AM
true, // cancelTokens
1, // max app attempts
Resource.newInstance(1024, 1));
ClientRMService rmService = rm1.getClientRMService();
rmService.submitApplication(SubmitApplicationRequest.newInstance(asc));
for (int i = 0; i < 30; i++) {
for (int i = 0; i < ZK_TIMEOUT_MS / 50; i++) {
if (HAServiceProtocol.HAServiceState.ACTIVE ==
rm1.getRMContext().getRMAdminService().getServiceStatus().getState()) {
Thread.sleep(100);

View File

@ -57,6 +57,7 @@
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.api.records.YarnApplicationAttemptState;
import org.apache.hadoop.yarn.event.AsyncDispatcher;
import org.apache.hadoop.yarn.event.EventHandler;
import org.apache.hadoop.yarn.event.InlineDispatcher;
@ -695,6 +696,8 @@ public void testUnmanagedAMUnexpectedRegistration() {
// launch AM and verify attempt failed
applicationAttempt.handle(new RMAppAttemptRegistrationEvent(
applicationAttempt.getAppAttemptId(), "host", 8042, "oldtrackingurl"));
assertEquals(YarnApplicationAttemptState.SUBMITTED,
applicationAttempt.createApplicationAttemptState());
testAppAttemptSubmittedToFailedState(
"Unmanaged AM must register after AM attempt reaches LAUNCHED state.");
}
@ -710,6 +713,8 @@ public void testUnmanagedAMContainersCleanup() {
// launch AM and verify attempt failed
applicationAttempt.handle(new RMAppAttemptRegistrationEvent(
applicationAttempt.getAppAttemptId(), "host", 8042, "oldtrackingurl"));
assertEquals(YarnApplicationAttemptState.SUBMITTED,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertFalse(transferStateFromPreviousAttempt);
}
@ -720,6 +725,8 @@ public void testNewToKilled() {
new RMAppAttemptEvent(
applicationAttempt.getAppAttemptId(),
RMAppAttemptEventType.KILL));
assertEquals(YarnApplicationAttemptState.NEW,
applicationAttempt.createApplicationAttemptState());
testAppAttemptKilledState(null, EMPTY_DIAGNOSTICS);
verifyTokenCount(applicationAttempt.getAppAttemptId(), 1);
}
@ -740,6 +747,8 @@ public void testSubmittedToKilled() {
new RMAppAttemptEvent(
applicationAttempt.getAppAttemptId(),
RMAppAttemptEventType.KILL));
assertEquals(YarnApplicationAttemptState.SUBMITTED,
applicationAttempt.createApplicationAttemptState());
testAppAttemptKilledState(null, EMPTY_DIAGNOSTICS);
}
@ -750,6 +759,8 @@ public void testScheduledToKilled() {
new RMAppAttemptEvent(
applicationAttempt.getAppAttemptId(),
RMAppAttemptEventType.KILL));
assertEquals(YarnApplicationAttemptState.SCHEDULED,
applicationAttempt.createApplicationAttemptState());
testAppAttemptKilledState(null, EMPTY_DIAGNOSTICS);
}
@ -760,6 +771,8 @@ public void testAllocatedToKilled() {
new RMAppAttemptEvent(
applicationAttempt.getAppAttemptId(),
RMAppAttemptEventType.KILL));
assertEquals(YarnApplicationAttemptState.ALLOCATED,
applicationAttempt.createApplicationAttemptState());
testAppAttemptKilledState(amContainer, EMPTY_DIAGNOSTICS);
}
@ -771,6 +784,8 @@ public void testAllocatedToFailed() {
new RMAppAttemptLaunchFailedEvent(
applicationAttempt.getAppAttemptId(),
diagnostics));
assertEquals(YarnApplicationAttemptState.ALLOCATED,
applicationAttempt.createApplicationAttemptState());
testAppAttemptFailedState(amContainer, diagnostics);
}
@ -784,6 +799,8 @@ public void testAMCrashAtAllocated() {
ContainerState.COMPLETE, containerDiagMsg, exitCode);
applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
applicationAttempt.getAppAttemptId(), cs));
assertEquals(YarnApplicationAttemptState.ALLOCATED,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.FAILED,
applicationAttempt.getAppAttemptState());
@ -815,7 +832,8 @@ public void testRunningToFailed() {
applicationAttempt.getAppAttemptId(), RMAppAttemptEventType.EXPIRE));
assertEquals(RMAppAttemptState.FINAL_SAVING,
applicationAttempt.getAppAttemptState());
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.FAILED,
applicationAttempt.getAppAttemptState());
@ -850,7 +868,8 @@ public void testRunningToKilled() {
applicationAttempt.getAppAttemptId(), RMAppAttemptEventType.EXPIRE));
assertEquals(RMAppAttemptState.FINAL_SAVING,
applicationAttempt.getAppAttemptState());
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.KILLED,
applicationAttempt.getAppAttemptState());
@ -871,6 +890,8 @@ public void testLaunchedExpire() {
launchApplicationAttempt(amContainer);
applicationAttempt.handle(new RMAppAttemptEvent(
applicationAttempt.getAppAttemptId(), RMAppAttemptEventType.EXPIRE));
assertEquals(YarnApplicationAttemptState.LAUNCHED,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.FAILED,
applicationAttempt.getAppAttemptState());
@ -890,6 +911,8 @@ public void testRunningExpire() {
runApplicationAttempt(amContainer, "host", 8042, "oldtrackingurl", false);
applicationAttempt.handle(new RMAppAttemptEvent(
applicationAttempt.getAppAttemptId(), RMAppAttemptEventType.EXPIRE));
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.FAILED,
applicationAttempt.getAppAttemptState());
@ -1055,6 +1078,8 @@ public void testSuccessfulFinishingToFinished() {
diagnostics));
assertEquals(RMAppAttemptState.FINAL_SAVING,
applicationAttempt.getAppAttemptState());
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
// Container_finished event comes before Attempt_Saved event.
applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
applicationAttempt.getAppAttemptId(), BuilderUtils.newContainerStatus(
@ -1083,6 +1108,8 @@ public void testFinalSavingToFinishedWithExpire() {
diagnostics));
assertEquals(RMAppAttemptState.FINAL_SAVING,
applicationAttempt.getAppAttemptState());
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
// Expire event comes before Attempt_saved event.
applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt
.getAppAttemptId(), RMAppAttemptEventType.EXPIRE));
@ -1118,6 +1145,8 @@ public void testGetClientToken() throws Exception {
applicationAttempt.handle(new RMAppAttemptEvent(applicationAttempt
.getAppAttemptId(), RMAppAttemptEventType.KILL));
assertEquals(YarnApplicationAttemptState.LAUNCHED,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
// after attempt is killed, can not get Client Token
token = applicationAttempt.createClientToken(null);
@ -1140,6 +1169,8 @@ public void testFailedToFailed() {
ApplicationAttemptId appAttemptId = applicationAttempt.getAppAttemptId();
applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
appAttemptId, cs1));
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.FAILED,
applicationAttempt.getAppAttemptState());
@ -1178,6 +1209,8 @@ scheduler, masterService, submissionContext, new Configuration(),
ApplicationAttemptId appAttemptId = applicationAttempt.getAppAttemptId();
applicationAttempt.handle(new RMAppAttemptContainerFinishedEvent(
appAttemptId, cs1));
assertEquals(YarnApplicationAttemptState.RUNNING,
applicationAttempt.createApplicationAttemptState());
sendAttemptUpdateSavedEvent(applicationAttempt);
assertEquals(RMAppAttemptState.FAILED,
applicationAttempt.getAppAttemptState());

View File

@ -35,7 +35,7 @@ Usage: yarn [--config confdir] COMMAND
*---------------+--------------+
|| COMMAND_OPTIONS || Description |
*---------------+--------------+
| --config confdir | Overwrites the default Configuration directory. Default is ${HADOOP_PREFIX}/conf. |
| --config confdir | Overwrites the default Configuration directory. Default is $\{HADOOP_PREFIX\}/conf. |
*---------------+--------------+
| COMMAND COMMAND_OPTIONS | Various commands with their options are described in the following sections. The commands have been grouped into {{User Commands}} and {{Administration Commands}}. |
*---------------+--------------+
@ -63,11 +63,22 @@ Usage: yarn [--config confdir] COMMAND
*---------------+--------------+
|| COMMAND_OPTIONS || Description |
*---------------+--------------+
| -status ApplicationId | Specify an application id |
| -list | Lists applications from the RM. Supports optional use of -appTypes
| | to filter applications based on application type, and -appStates to
| | filter applications based on application state.
*---------------+--------------+
| -list | Lists all the Applications from RM |
| -appStates States | Works with -list to filter applications based on input
| | comma-separated list of application states. The valid
| | application state can be one of the following: \
| | ALL, NEW, NEW_SAVING, SUBMITTED, ACCEPTED, RUNNING,
| | FINISHED, FAILED, KILLED
*---------------+--------------+
| -kill ApplicationId | Specify an application id |
| -appTypes Types | Works with -list to filter applications based on input
| | comma-separated list of application types.
*---------------+--------------+
| -status ApplicationId | Prints the status of the application.
*---------------+--------------+
| -kill ApplicationId | Kills the application.
*---------------+--------------+
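
  As an illustration only, a few invocations combining the options above might
  look like the following; the application ids shown are hypothetical
  placeholders, not values taken from this document.

-------
  $ yarn application -list -appStates RUNNING,ACCEPTED
  $ yarn application -status application_1392837283729_0001
  $ yarn application -kill application_1392837283729_0001
-------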
** node
@ -81,9 +92,15 @@ Usage: yarn [--config confdir] COMMAND
*---------------+--------------+
|| COMMAND_OPTIONS || Description |
*---------------+--------------+
| -status NodeId | Specify a node id |
| -list | Lists all running nodes. Supports optional use of -states to filter
| | nodes based on node state, and -all to list all nodes.
*---------------+--------------+
| -list | Lists all the Nodes |
| -states States | Works with -list to filter nodes based on input
| | comma-separated list of node states.
*---------------+--------------+
| -all | Works with -list to list all nodes.
*---------------+--------------+
| -status NodeId | Prints the status report of the node.
*---------------+--------------+
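
  As a sketch of typical usage, assuming DECOMMISSIONED and LOST are valid
  node states in this release and that the node id follows the nodename:port
  format, invocations might look like:

-------
  $ yarn node -list -all
  $ yarn node -list -states DECOMMISSIONED,LOST
  $ yarn node -status nodehost.example.com:45454
-------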
** logs
@ -91,19 +108,22 @@ Usage: yarn [--config confdir] COMMAND
Dump the container logs
-------
Usage: yarn logs <options>
Usage: yarn logs -applicationId <application ID> <options>
-------
*---------------+--------------+
|| COMMAND_OPTIONS || Description |
*---------------+--------------+
| -applicationId ApplicationId | Specify an application id |
| -applicationId \<application ID\> | Specifies an application id |
*---------------+--------------+
| -appOwner AppOwner | Specify an application owner |
| -appOwner AppOwner | AppOwner (assumed to be current user if not
| | specified)
*---------------+--------------+
| -containerId ContainerId | Specify a container id |
| -containerId ContainerId | ContainerId (must be specified if node address is
| | specified)
*---------------+--------------+
| -nodeAddress NodeAddress | Specify a node address |
| -nodeAddress NodeAddress | NodeAddress in the format nodename:port (must be
| | specified if container id is specified)
*---------------+--------------+
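
  A possible invocation, with a hypothetical application id, owner, node
  address and container id standing in for real values:

-------
  $ yarn logs -applicationId application_1392837283729_0001 -appOwner hadoopuser
  $ yarn logs -applicationId application_1392837283729_0001 \
      -nodeAddress nodehost.example.com:45454 \
      -containerId container_1392837283729_0001_01_000001
-------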
** classpath
@ -158,7 +178,11 @@ Usage: yarn [--config confdir] COMMAND
-------
Usage: yarn rmadmin [-refreshQueues] [-refreshNodes] [-refreshUserToGroupsMapping]
[-refreshSuperUserGroupsConfiguration] [-refreshAdminAcls]
[-refreshServiceAcl] [-help [cmd]]
[-refreshServiceAcl] [-getGroups [username]] [-help [cmd]]
[-transitionToActive <serviceId>]
[-transitionToStandby <serviceId>]
[-getServiceState <serviceId>]
[-checkHealth <serviceId>]
-------
*---------------+--------------+
@ -176,8 +200,22 @@ Usage: yarn [--config confdir] COMMAND
*---------------+--------------+
| -refreshServiceAcl | Reload the service-level authorization policy file. ResourceManager will reload the authorization policy file. |
*---------------+--------------+
| -getGroups [username] | Get groups the specified user belongs to.
*---------------+--------------+
| -help [cmd] | Displays help for the given command or all commands if none is specified. |
*---------------+--------------+
| -transitionToActive \<serviceId\> | Transitions the service into Active
| | state.
*---------------+--------------+
| -transitionToStandby \<serviceId\> | Transitions the service into Standby
| | state.
*---------------+--------------+
| -getServiceState \<serviceId\> | Returns the state of the service.
*---------------+--------------+
| -checkHealth \<serviceId\> | Requests that the service perform a health
| | check. The RMAdmin tool will exit with a
| | non-zero exit code if the check fails.
*---------------+--------------+
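
  For the HA-related options, a rough sketch of usage follows; rm1 and rm2 are
  hypothetical service ids, assumed to match the RM ids configured for HA in
  yarn-site.xml.

-------
  $ yarn rmadmin -getServiceState rm1
  $ yarn rmadmin -transitionToStandby rm1
  $ yarn rmadmin -checkHealth rm2
-------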
** daemonlog
@ -191,9 +229,9 @@ Usage: yarn [--config confdir] COMMAND
*---------------+--------------+
|| COMMAND_OPTIONS || Description |
*---------------+--------------+
| -getlevel <host:port> <name> | Prints the log level of the daemon running at <host:port>. This command internally connects to http://<host:port>/logLevel?log=<name> |
| -getlevel \<host:port\> \<name\> | Prints the log level of the daemon running at \<host:port\>. This command internally connects to http://\<host:port\>/logLevel?log=\<name\>
*---------------+--------------+
| -setlevel <host:port> <name> <level> | Sets the log level of the daemon running at <host:port>. This command internally connects to http://<host:port>/logLevel?log=<name> |
| -setlevel \<host:port\> \<name\> \<level\> | Sets the log level of the daemon running at \<host:port\>. This command internally connects to http://\<host:port\>/logLevel?log=\<name\>
*---------------+--------------+
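
  As an example only, with a hypothetical host and the ResourceManager's
  logger name; the web port and log level shown are assumptions, not values
  taken from this document:

-------
  $ yarn daemonlog -getlevel rmhost.example.com:8088 org.apache.hadoop.yarn.server.resourcemanager.ResourceManager
  $ yarn daemonlog -setlevel rmhost.example.com:8088 org.apache.hadoop.yarn.server.resourcemanager.ResourceManager DEBUG
-------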