HDFS-5050. Add DataNode support for mlock and munlock (contributed by Andrew Wang)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-4949@1517106 13f79535-47bb-0310-9956-ffa450edef68
Colin McCabe 2013-08-24 03:41:25 +00:00
parent 920b4cc06f
commit b992219fa1
17 changed files with 1027 additions and 2 deletions


@@ -15,6 +15,9 @@ HDFS-4949 (Unreleased)
HDFS-5052. Add cacheRequest/uncacheRequest support to NameNode.
(contributed by Colin Patrick McCabe)
HDFS-5050. Add DataNode support for mlock and munlock
(Andrew Wang via Colin Patrick McCabe)
OPTIMIZATIONS
BUG FIXES


@@ -100,6 +100,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final boolean DFS_DATANODE_USE_DN_HOSTNAME_DEFAULT = false;
public static final String DFS_DATANODE_MAX_LOCKED_MEMORY_KEY = "dfs.datanode.max.locked.memory";
public static final long DFS_DATANODE_MAX_LOCKED_MEMORY_DEFAULT = 0;
public static final String DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_KEY = "dfs.datanode.fsdatasetcache.max.threads.per.volume";
public static final int DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_DEFAULT = 4;
public static final String DFS_NAMENODE_HTTP_PORT_KEY = "dfs.http.port";
public static final int DFS_NAMENODE_HTTP_PORT_DEFAULT = 50070;
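For reference, the two new keys can also be set programmatically, as the new TestFsDatasetCache below does for the locked-memory limit; a minimal hedged snippet (the values here are examples only, not recommendations from this change):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

// Illustrative only: allow the DataNode to mlock up to 64 KB of block data
// and use up to 4 cache worker threads per volume.
Configuration conf = new HdfsConfiguration();
conf.setLong(DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY, 64 * 1024);
conf.setInt(DFSConfigKeys.DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_KEY, 4);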


@@ -549,6 +549,14 @@ class BPOfferService {
}
dn.metrics.incrBlocksRemoved(toDelete.length);
break;
case DatanodeProtocol.DNA_CACHE:
LOG.info("DatanodeCommand action: DNA_CACHE");
dn.getFSDataset().cache(bcmd.getBlockPoolId(), bcmd.getBlocks());
break;
case DatanodeProtocol.DNA_UNCACHE:
LOG.info("DatanodeCommand action: DNA_UNCACHE");
dn.getFSDataset().uncache(bcmd.getBlockPoolId(), bcmd.getBlocks());
break;
case DatanodeProtocol.DNA_SHUTDOWN:
// TODO: DNA_SHUTDOWN appears to be unused - the NN never sends this command
// See HDFS-2987.


@@ -449,6 +449,10 @@ class BPServiceActor implements Runnable {
long startTime = Time.monotonicNow();
if (startTime - lastCacheReport > dnConf.cacheReportInterval) {
// TODO: Implement me!
String bpid = bpos.getBlockPoolId();
BlockListAsLongs blocks = dn.getFSDataset().getCacheReport(bpid);
cmd = bpNamenode.cacheReport(bpRegistration, bpid,
blocks.getBlockListAsLongs());
}
return cmd;
}
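The cacheReport() RPC itself is not part of this hunk; presumably DatanodeProtocol declares it alongside blockReport(), roughly as follows (a hedged sketch inferred from the call above, not the actual declaration):

// Hypothetical declaration matching the call
//   bpNamenode.cacheReport(bpRegistration, bpid, blocks.getBlockListAsLongs());
// the real signature lives in DatanodeProtocol and may differ.
public DatanodeCommand cacheReport(DatanodeRegistration registration,
    String poolId, long[] blocks) throws IOException;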


@@ -160,4 +160,8 @@ public class DNConf {
public long getXceiverStopTimeout() {
return xceiverStopTimeout;
}
public long getMaxLockedMemory() {
return maxLockedMemory;
}
}
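The maxLockedMemory field read by this getter is not shown in the hunk; presumably it is populated from the new configuration key in the DNConf constructor, along these lines (a hedged sketch, not the actual constructor code):

// Hedged sketch of the expected wiring in DNConf's constructor;
// the real assignment is outside this hunk.
this.maxLockedMemory = conf.getLong(
    DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
    DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_DEFAULT);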


@@ -269,6 +269,14 @@ public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
*/
public BlockListAsLongs getBlockReport(String bpid);
/**
* Returns the cache report - the full list of cached blocks of a
* block pool.
* @param bpid Block Pool Id
* @return the cache report - the full list of cached blocks
*/
public BlockListAsLongs getCacheReport(String bpid);
/** Does the dataset contain the block? */
public boolean contains(ExtendedBlock block);
@@ -294,6 +302,20 @@ public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
*/
public void invalidate(String bpid, Block invalidBlks[]) throws IOException;
/**
* Caches the specified blocks
* @param bpid Block pool id
* @param cacheBlks - blocks to cache
*/
public void cache(String bpid, Block[] cacheBlks);
/**
* Uncaches the specified blocks
* @param bpid Block pool id
* @param uncacheBlks - blocks to uncache
*/
public void uncache(String bpid, Block[] uncacheBlks);
/**
* Check if all the data directories are healthy
* @throws DiskErrorException


@@ -0,0 +1,240 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* Manages caching for an FsDatasetImpl by using the mmap(2) and mlock(2)
* system calls to lock blocks into memory. Block checksums are verified upon
* entry into the cache.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
public class FsDatasetCache {
private static final Log LOG = LogFactory.getLog(FsDatasetCache.class);
/**
* Map of cached blocks
*/
private final ConcurrentMap<Long, MappableBlock> cachedBlocks;
private final FsDatasetImpl dataset;
/**
* Number of cached bytes
*/
private AtomicLong usedBytes;
/**
* Total cache capacity in bytes
*/
private final long maxBytes;
public FsDatasetCache(FsDatasetImpl dataset) {
this.dataset = dataset;
this.cachedBlocks = new ConcurrentHashMap<Long, MappableBlock>();
this.usedBytes = new AtomicLong(0);
this.maxBytes = dataset.datanode.getDnConf().getMaxLockedMemory();
}
/**
* @return true if the block is cached
*/
boolean isCached(String bpid, Block block) {
MappableBlock mapBlock = cachedBlocks.get(block.getBlockId());
if (mapBlock != null) {
return mapBlock.getBlockPoolId().equals(bpid);
}
return false;
}
/**
* @return List of cached blocks suitable for translation into a
* {@link BlockListAsLongs} for a cache report.
*/
List<Block> getCachedBlocks(String bpid) {
List<Block> blocks = new ArrayList<Block>();
// ConcurrentHashMap iteration may not reflect the very latest updates, which is okay
for (MappableBlock mapBlock : cachedBlocks.values()) {
if (mapBlock.getBlockPoolId().equals(bpid)) {
blocks.add(mapBlock.getBlock());
}
}
return blocks;
}
/**
* Asynchronously attempts to cache a block. This is subject to the
* configured maximum locked memory limit.
*
* @param bpid block pool id of the block
* @param block block to cache
* @param volume volume of the block
* @param blockIn stream of the block's data file
* @param metaIn stream of the block's meta file
*/
void cacheBlock(String bpid, Block block, FsVolumeImpl volume,
FileInputStream blockIn, FileInputStream metaIn) {
if (isCached(bpid, block)) {
return;
}
MappableBlock mapBlock = null;
try {
mapBlock = new MappableBlock(bpid, block, volume, blockIn, metaIn);
} catch (IOException e) {
LOG.warn("Failed to cache replica " + block + ": Could not instantiate"
+ " MappableBlock", e);
IOUtils.closeQuietly(blockIn);
IOUtils.closeQuietly(metaIn);
return;
}
// Check if there's sufficient cache capacity
boolean success = false;
long bytes = mapBlock.getNumBytes();
long used = usedBytes.get();
while (used+bytes < maxBytes) {
if (usedBytes.compareAndSet(used, used+bytes)) {
success = true;
break;
}
used = usedBytes.get();
}
if (!success) {
LOG.warn(String.format(
"Failed to cache replica %s: %s exceeded (%d + %d > %d)",
mapBlock.getBlock().toString(),
DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
used, bytes, maxBytes));
mapBlock.close();
return;
}
// Submit it to the worker pool to be cached
volume.getExecutor().execute(new WorkerTask(mapBlock));
}
/**
* Uncaches a block if it is cached.
* @param bpid block pool id of the block
* @param block the block to uncache
*/
void uncacheBlock(String bpid, Block block) {
MappableBlock mapBlock = cachedBlocks.get(block.getBlockId());
if (mapBlock != null &&
mapBlock.getBlockPoolId().equals(bpid) &&
mapBlock.getBlock().equals(block)) {
mapBlock.close();
cachedBlocks.remove(block.getBlockId());
long bytes = mapBlock.getNumBytes();
long used = usedBytes.get();
while (!usedBytes.compareAndSet(used, used - bytes)) {
used = usedBytes.get();
}
}
}
/**
* Background worker that mmaps, mlocks, and checksums a block
*/
private class WorkerTask implements Runnable {
private MappableBlock block;
WorkerTask(MappableBlock block) {
this.block = block;
}
@Override
public void run() {
boolean success = false;
try {
block.map();
block.lock();
block.verifyChecksum();
success = true;
} catch (ChecksumException e) {
// Exception message is bogus since this wasn't caused by a file read
LOG.warn("Failed to cache block " + block.getBlock() + ": Checksum "
+ "verification failed.");
} catch (IOException e) {
LOG.warn("Failed to cache block " + block.getBlock() + ": IOException",
e);
}
// If we failed or the block became uncacheable in the meantime,
// clean up and return the reserved cache allocation
if (!success ||
!dataset.validToCache(block.getBlockPoolId(), block.getBlock())) {
block.close();
long used = usedBytes.get();
while (!usedBytes.compareAndSet(used, used-block.getNumBytes())) {
used = usedBytes.get();
}
} else {
cachedBlocks.put(block.getBlock().getBlockId(), block);
}
}
}
// Stats related methods for FsDatasetMBean
public long getCacheUsed() {
return usedBytes.get();
}
public long getCacheCapacity() {
return maxBytes;
}
public long getCacheRemaining() {
return maxBytes - usedBytes.get();
}
}
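Note that cacheBlock(), uncacheBlock(), and WorkerTask all account for cache capacity through optimistic compareAndSet loops on usedBytes rather than holding a lock. A standalone sketch of that reserve/release pattern (a hypothetical helper, not part of the patch):

import java.util.concurrent.atomic.AtomicLong;

// Hypothetical illustration of the reservation pattern used by FsDatasetCache.
class CacheCapacityCounter {
  private final AtomicLong usedBytes = new AtomicLong(0);
  private final long maxBytes;

  CacheCapacityCounter(long maxBytes) {
    this.maxBytes = maxBytes;
  }

  // Attempt to reserve 'bytes'; false means the configured limit would be exceeded.
  boolean reserve(long bytes) {
    long used = usedBytes.get();
    while (used + bytes < maxBytes) {           // same bound as cacheBlock()
      if (usedBytes.compareAndSet(used, used + bytes)) {
        return true;                            // reservation succeeded
      }
      used = usedBytes.get();                   // lost a race with another caller, retry
    }
    return false;
  }

  // Return a reservation, e.g. after uncaching or a failed map/mlock/checksum.
  void release(long bytes) {
    long used = usedBytes.get();
    while (!usedBytes.compareAndSet(used, used - bytes)) {
      used = usedBytes.get();
    }
  }
}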


@@ -37,6 +37,7 @@ import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.StandardMBean;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
@@ -170,6 +171,7 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
final FsVolumeList volumes;
final ReplicaMap volumeMap;
final FsDatasetAsyncDiskService asyncDiskService;
final FsDatasetCache cacheManager;
private final int validVolsRequired;
// Used for synchronizing access to usage stats
@@ -228,6 +230,7 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
roots[idx] = storage.getStorageDir(idx).getCurrentDir();
}
asyncDiskService = new FsDatasetAsyncDiskService(datanode, roots);
cacheManager = new FsDatasetCache(this);
registerMBean(storage.getStorageID());
}
@@ -287,6 +290,30 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
return volumes.numberOfFailedVolumes();
}
/**
* Returns the total cache used by the datanode (in bytes).
*/
@Override // FSDatasetMBean
public long getCacheUsed() {
return cacheManager.getCacheUsed();
}
/**
* Returns the total cache capacity of the datanode (in bytes).
*/
@Override // FSDatasetMBean
public long getCacheCapacity() {
return cacheManager.getCacheCapacity();
}
/**
* Returns the total amount of cache remaining (in bytes).
*/
@Override // FSDatasetMBean
public long getCacheRemaining() {
return cacheManager.getCacheRemaining();
}
/**
* Find the block's on-disk length
*/
@@ -534,6 +561,8 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
private synchronized ReplicaBeingWritten append(String bpid,
FinalizedReplica replicaInfo, long newGS, long estimateBlockLen)
throws IOException {
// uncache the block
cacheManager.uncacheBlock(bpid, replicaInfo);
// unlink the finalized replica
replicaInfo.unlinkBlock(1);
@@ -1001,6 +1030,11 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
}
}
@Override // FsDatasetSpi
public BlockListAsLongs getCacheReport(String bpid) {
return new BlockListAsLongs(cacheManager.getCachedBlocks(bpid), null);
}
/**
* Get the list of finalized blocks from in-memory blockmap for a block pool.
*/
@@ -1143,6 +1177,8 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
volumeMap.remove(bpid, invalidBlks[i]);
}
// Uncache the block synchronously
cacheManager.uncacheBlock(bpid, invalidBlks[i]);
// Delete the block asynchronously to make sure we can do it fast enough
asyncDiskService.deleteAsync(v, f,
FsDatasetUtil.getMetaFile(f, invalidBlks[i].getGenerationStamp()),
@@ -1153,6 +1189,78 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
}
}
synchronized boolean validToCache(String bpid, Block blk) {
ReplicaInfo info = volumeMap.get(bpid, blk);
if (info == null) {
LOG.warn("Failed to cache replica " + blk + ": ReplicaInfo not found.");
return false;
}
FsVolumeImpl volume = (FsVolumeImpl)info.getVolume();
if (volume == null) {
LOG.warn("Failed to cache replica " + blk + ": Volume not found.");
return false;
}
if (info.getState() != ReplicaState.FINALIZED) {
LOG.warn("Failed to cache replica " + blk + ": Replica is not"
+ " finalized.");
return false;
}
return true;
}
/**
* Asynchronously attempts to cache a single block via {@link FsDatasetCache}.
*/
private void cacheBlock(String bpid, Block blk) {
ReplicaInfo info;
FsVolumeImpl volume;
synchronized (this) {
if (!validToCache(bpid, blk)) {
return;
}
info = volumeMap.get(bpid, blk);
volume = (FsVolumeImpl)info.getVolume();
}
// Try to open block and meta streams
FileInputStream blockIn = null;
FileInputStream metaIn = null;
boolean success = false;
try {
ExtendedBlock extBlk = new ExtendedBlock(bpid, blk);
blockIn = (FileInputStream)getBlockInputStream(extBlk, 0);
metaIn = (FileInputStream)getMetaDataInputStream(extBlk)
.getWrappedStream();
success = true;
} catch (ClassCastException e) {
LOG.warn("Failed to cache replica " + blk + ": Underlying blocks"
+ " are not backed by files.", e);
} catch (IOException e) {
LOG.warn("Failed to cache replica " + blk + ": IOException while"
+ " trying to open block or meta files.", e);
}
if (!success) {
IOUtils.closeQuietly(blockIn);
IOUtils.closeQuietly(metaIn);
return;
}
cacheManager.cacheBlock(bpid, blk, volume, blockIn, metaIn);
}
@Override // FsDatasetSpi
public void cache(String bpid, Block[] cacheBlks) {
for (int i=0; i<cacheBlks.length; i++) {
cacheBlock(bpid, cacheBlks[i]);
}
}
@Override // FsDatasetSpi
public void uncache(String bpid, Block[] uncacheBlks) {
for (int i=0; i<uncacheBlks.length; i++) {
Block blk = uncacheBlks[i];
cacheManager.uncacheBlock(bpid, blk);
}
}
@Override // FsDatasetSpi
public synchronized boolean contains(final ExtendedBlock block) {
final long blockId = block.getLocalBlock().getBlockId();


@@ -18,11 +18,17 @@
package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.Executor;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
@@ -34,6 +40,8 @@ import org.apache.hadoop.hdfs.server.datanode.DataStorage;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* The underlying volume used to store replicas.
*
@@ -48,6 +56,13 @@ class FsVolumeImpl implements FsVolumeSpi {
private final File currentDir; // <StorageDirectory>/current
private final DF usage;
private final long reserved;
/**
* Per-volume worker pool that processes new blocks to cache.
* The maximum number of workers per volume is bounded (configurable via
* dfs.datanode.fsdatasetcache.max.threads.per.volume) to limit resource
* contention.
*/
private final ThreadPoolExecutor cacheExecutor;
FsVolumeImpl(FsDatasetImpl dataset, String storageID, File currentDir,
Configuration conf) throws IOException {
@@ -59,6 +74,20 @@ class FsVolumeImpl implements FsVolumeSpi {
this.currentDir = currentDir;
File parent = currentDir.getParentFile();
this.usage = new DF(parent, conf);
final int maxNumThreads = dataset.datanode.getConf().getInt(
DFSConfigKeys.DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_KEY,
DFSConfigKeys.DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_DEFAULT
);
ThreadFactory workerFactory = new ThreadFactoryBuilder()
.setDaemon(true)
.setNameFormat("FsVolumeImplWorker-" + parent.toString() + "-%d")
.build();
cacheExecutor = new ThreadPoolExecutor(
1, maxNumThreads,
60, TimeUnit.SECONDS,
new LinkedBlockingQueue<Runnable>(),
workerFactory);
cacheExecutor.allowCoreThreadTimeOut(true);
}
File getCurrentDir() {
@@ -167,6 +196,10 @@ class FsVolumeImpl implements FsVolumeSpi {
return getBlockPoolSlice(bpid).addBlock(b, f);
}
Executor getExecutor() {
return cacheExecutor;
}
void checkDirs() throws DiskErrorException {
// TODO:FEDERATION valid synchronization
for(BlockPoolSlice s : bpSlices.values()) {
@@ -210,6 +243,7 @@ class FsVolumeImpl implements FsVolumeSpi {
}
void shutdown() {
cacheExecutor.shutdown();
Set<Entry<String, BlockPoolSlice>> set = bpSlices.entrySet();
for (Entry<String, BlockPoolSlice> entry : set) {
entry.getValue().shutdown();


@@ -0,0 +1,249 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode.fsdataset.impl;
import java.io.BufferedInputStream;
import java.io.Closeable;
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.util.DataChecksum;
import com.google.common.base.Preconditions;
/**
* Low-level wrapper for a Block and its backing files that provides mmap,
* mlock, and checksum verification operations.
*
* This could be a private class of FsDatasetCache; it is not meant for other users.
*/
@InterfaceAudience.Private
@InterfaceStability.Unstable
class MappableBlock implements Closeable {
private final String bpid;
private final Block block;
private final FsVolumeImpl volume;
private final FileInputStream blockIn;
private final FileInputStream metaIn;
private final FileChannel blockChannel;
private final FileChannel metaChannel;
private final long blockSize;
private boolean isMapped;
private boolean isLocked;
private boolean isChecksummed;
private MappedByteBuffer blockMapped = null;
public MappableBlock(String bpid, Block blk, FsVolumeImpl volume,
FileInputStream blockIn, FileInputStream metaIn) throws IOException {
this.bpid = bpid;
this.block = blk;
this.volume = volume;
this.blockIn = blockIn;
this.metaIn = metaIn;
this.blockChannel = blockIn.getChannel();
this.metaChannel = metaIn.getChannel();
this.blockSize = blockChannel.size();
this.isMapped = false;
this.isLocked = false;
this.isChecksummed = false;
}
public String getBlockPoolId() {
return bpid;
}
public Block getBlock() {
return block;
}
public FsVolumeImpl getVolume() {
return volume;
}
public boolean isMapped() {
return isMapped;
}
public boolean isLocked() {
return isLocked;
}
public boolean isChecksummed() {
return isChecksummed;
}
/**
* Returns the number of bytes on disk for the block file
*/
public long getNumBytes() {
return blockSize;
}
/**
* Maps the block into memory. See mmap(2).
*/
public void map() throws IOException {
if (isMapped) {
return;
}
blockMapped = blockChannel.map(MapMode.READ_ONLY, 0, blockSize);
isMapped = true;
}
/**
* Unmaps the block from memory. See munmap(2).
*/
public void unmap() {
if (!isMapped) {
return;
}
if (blockMapped instanceof sun.nio.ch.DirectBuffer) {
sun.misc.Cleaner cleaner =
((sun.nio.ch.DirectBuffer)blockMapped).cleaner();
cleaner.clean();
}
isMapped = false;
isLocked = false;
isChecksummed = false;
}
/**
* Locks the block into memory. This prevents the block from being paged out.
* See mlock(2).
*/
public void lock() throws IOException {
Preconditions.checkArgument(isMapped,
"Block must be mapped before it can be locked!");
if (isLocked) {
return;
}
NativeIO.POSIX.mlock(blockMapped, blockSize);
isLocked = true;
}
/**
* Unlocks the block from memory, allowing it to be paged out. See munlock(2).
*/
public void unlock() throws IOException {
if (!isLocked || !isMapped) {
return;
}
NativeIO.POSIX.munlock(blockMapped, blockSize);
isLocked = false;
isChecksummed = false;
}
/**
* Reads bytes into a buffer until EOF or the buffer's limit is reached
*/
private int fillBuffer(FileChannel channel, ByteBuffer buf)
throws IOException {
int bytesRead = channel.read(buf);
if (bytesRead < 0) {
//EOF
return bytesRead;
}
while (buf.remaining() > 0) {
int n = channel.read(buf);
if (n < 0) {
//EOF
return bytesRead;
}
bytesRead += n;
}
return bytesRead;
}
/**
* Verifies the block's checksum. This is an I/O intensive operation.
* @throws ChecksumException if checksum verification fails
* @throws IOException if the block or meta file cannot be read
*/
public void verifyChecksum() throws IOException, ChecksumException {
Preconditions.checkArgument(isLocked && isMapped,
"Block must be mapped and locked before checksum verification!");
// skip if checksum has already been successfully verified
if (isChecksummed) {
return;
}
// Verify the checksum from the block's meta file
// Get the DataChecksum from the meta file header
metaChannel.position(0);
BlockMetadataHeader header =
BlockMetadataHeader.readHeader(new DataInputStream(
new BufferedInputStream(metaIn, BlockMetadataHeader
.getHeaderSize())));
DataChecksum checksum = header.getChecksum();
final int bytesPerChecksum = checksum.getBytesPerChecksum();
final int checksumSize = checksum.getChecksumSize();
final int numChunks = (8*1024*1024) / bytesPerChecksum;
ByteBuffer blockBuf = ByteBuffer.allocate(numChunks*bytesPerChecksum);
ByteBuffer checksumBuf = ByteBuffer.allocate(numChunks*checksumSize);
// Verify the checksum
int bytesVerified = 0;
while (bytesVerified < blockChannel.size()) {
Preconditions.checkState(bytesVerified % bytesPerChecksum == 0,
"Unexpected partial chunk before EOF");
int bytesRead = fillBuffer(blockChannel, blockBuf);
if (bytesRead == -1) {
throw new IOException("Premature EOF");
}
blockBuf.flip();
// Number of read chunks, including partial chunk at end
int chunks = (bytesRead+bytesPerChecksum-1) / bytesPerChecksum;
checksumBuf.limit(chunks*checksumSize);
fillBuffer(metaChannel, checksumBuf);
checksumBuf.flip();
checksum.verifyChunkedSums(blockBuf, checksumBuf, block.getBlockName(),
bytesVerified);
// Success
bytesVerified += bytesRead;
blockBuf.clear();
checksumBuf.clear();
}
isChecksummed = true;
// Can close the backing file since everything is safely in memory
blockChannel.close();
}
@Override
public void close() {
unmap();
IOUtils.closeQuietly(blockIn);
IOUtils.closeQuietly(metaIn);
}
}
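For orientation, the intended life cycle of a MappableBlock mirrors the WorkerTask in FsDatasetCache above; the sketch below is illustrative only and assumes bpid, block, volume, blockIn, and metaIn are already in scope:

// Hedged usage sketch; in the patch this sequence is driven by
// FsDatasetCache.WorkerTask rather than by external callers.
MappableBlock mappable = new MappableBlock(bpid, block, volume, blockIn, metaIn);
try {
  mappable.map();             // mmap(2) the block file
  mappable.lock();            // mlock(2) the mapped pages so they cannot be paged out
  mappable.verifyChecksum();  // verify against the meta file, then close the block channel
} catch (IOException e) {
  mappable.close();           // munmap and close the backing streams on failure
}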


@@ -77,4 +77,19 @@ public interface FSDatasetMBean {
* @return The number of failed volumes in the datanode.
*/
public int getNumFailedVolumes();
/**
* Returns the total cache used by the datanode (in bytes).
*/
public long getCacheUsed();
/**
* Returns the total cache capacity of the datanode (in bytes).
*/
public long getCacheCapacity();
/**
* Returns the total amount of cache remaining (in bytes).
*/
public long getCacheRemaining();
}


@@ -74,6 +74,8 @@ public interface DatanodeProtocol {
final static int DNA_RECOVERBLOCK = 6; // request a block recovery
final static int DNA_ACCESSKEYUPDATE = 7; // update access key
final static int DNA_BALANCERBANDWIDTHUPDATE = 8; // update balancer bandwidth
final static int DNA_CACHE = 9; // cache blocks
final static int DNA_UNCACHE = 10; // uncache blocks
/**
* Register Datanode.


@@ -102,6 +102,8 @@ message BlockCommandProto {
TRANSFER = 1; // Transfer blocks to another datanode
INVALIDATE = 2; // Invalidate blocks
SHUTDOWN = 3; // Shutdown the datanode
CACHE = 4; // Cache blocks on the datanode
UNCACHE = 5; // Uncache blocks on the datanode
}
required Action action = 1;
required string blockPoolId = 2;


@@ -1419,4 +1419,27 @@
</description>
</property>
<property>
<name>dfs.datanode.max.locked.memory</name>
<value>0</value>
<description>
The amount of memory in bytes to use for caching of block replicas in
memory on the datanode. The datanode's maximum locked memory soft ulimit
(RLIMIT_MEMLOCK) must be set to at least this value, else the datanode
will abort on startup.
By default, this parameter is set to 0, which disables in-memory caching.
</description>
</property>
<property>
<name>dfs.datanode.fsdatasetcache.max.threads.per.volume</name>
<value>4</value>
<description>
The maximum number of threads per volume to use for caching new data
on the datanode. These threads consume both I/O and CPU. This can affect
normal datanode operations.
</description>
</property>
</configuration>


@@ -61,4 +61,15 @@ public class LogVerificationAppender extends AppenderSkeleton {
}
return count;
}
public int countLinesWithMessage(final String text) {
int count = 0;
for (LoggingEvent e: getLog()) {
String msg = e.getRenderedMessage();
if (msg != null && msg.contains(text)) {
count++;
}
}
return count;
}
}


@@ -465,6 +465,11 @@ public class SimulatedFSDataset implements FsDatasetSpi<FsVolumeSpi> {
return new BlockListAsLongs(blocks, null);
}
@Override // FsDatasetSpi
public BlockListAsLongs getCacheReport(String bpid) {
return new BlockListAsLongs();
}
@Override // FSDatasetMBean
public long getCapacity() {
return storage.getCapacity();
@@ -490,6 +495,21 @@ public class SimulatedFSDataset implements FsDatasetSpi<FsVolumeSpi> {
return storage.getNumFailedVolumes();
}
@Override // FSDatasetMBean
public long getCacheUsed() {
return 0L;
}
@Override // FSDatasetMBean
public long getCacheCapacity() {
return 0L;
}
@Override // FSDatasetMBean
public long getCacheRemaining() {
return 0L;
}
@Override // FsDatasetSpi
public synchronized long getLength(ExtendedBlock b) throws IOException {
final Map<Block, BInfo> map = getMap(b.getBlockPoolId());
@@ -559,6 +579,18 @@ public class SimulatedFSDataset implements FsDatasetSpi<FsVolumeSpi> {
}
}
@Override // FSDatasetSpi
public void cache(String bpid, Block[] cacheBlks) {
throw new UnsupportedOperationException(
"SimulatedFSDataset does not support cache operation!");
}
@Override // FSDatasetSpi
public void uncache(String bpid, Block[] uncacheBlks) {
throw new UnsupportedOperationException(
"SimulatedFSDataset does not support uncache operation!");
}
private BInfo getBInfo(final ExtendedBlock b) {
final Map<Block, BInfo> map = blockMap.get(b.getBlockPoolId());
return map == null? null: map.get(b.getLocalBlock());


@@ -0,0 +1,266 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import static org.junit.Assert.assertEquals;
import static org.mockito.Matchers.*;
import static org.mockito.Mockito.doReturn;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.channels.FileChannel;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.HdfsBlockLocation;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.LogVerificationAppender;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
import org.apache.hadoop.hdfs.server.namenode.FSImage;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse;
import org.apache.hadoop.hdfs.server.protocol.NNHAStatusHeartbeat;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.log4j.Logger;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
public class TestFsDatasetCache {
// Most Linux installs allow a default of 64KB locked memory
private static final long CACHE_CAPACITY = 64 * 1024;
private static final long BLOCK_SIZE = 4096;
private static Configuration conf;
private static MiniDFSCluster cluster = null;
private static FileSystem fs;
private static NameNode nn;
private static FSImage fsImage;
private static DataNode dn;
private static FsDatasetSpi<?> fsd;
private static DatanodeProtocolClientSideTranslatorPB spyNN;
@Before
public void setUp() throws Exception {
conf = new HdfsConfiguration();
conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
conf.setLong(DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,
CACHE_CAPACITY);
conf.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
cluster = new MiniDFSCluster.Builder(conf)
.numDataNodes(1).build();
cluster.waitActive();
fs = cluster.getFileSystem();
nn = cluster.getNameNode();
fsImage = nn.getFSImage();
dn = cluster.getDataNodes().get(0);
fsd = dn.getFSDataset();
spyNN = DataNodeTestUtils.spyOnBposToNN(dn, nn);
}
@After
public void tearDown() throws Exception {
if (cluster != null) {
cluster.shutdown();
}
}
private static void setHeartbeatResponse(DatanodeCommand[] cmds)
throws IOException {
HeartbeatResponse response = new HeartbeatResponse(
cmds,
new NNHAStatusHeartbeat(HAServiceState.ACTIVE,
fsImage.getLastAppliedOrWrittenTxId()));
doReturn(response).when(spyNN).sendHeartbeat(
(DatanodeRegistration) any(),
(StorageReport[]) any(),
anyInt(), anyInt(), anyInt());
}
private static DatanodeCommand[] cacheBlock(HdfsBlockLocation loc) {
return cacheBlocks(new HdfsBlockLocation[] {loc});
}
private static DatanodeCommand[] cacheBlocks(HdfsBlockLocation[] locs) {
return new DatanodeCommand[] {
getResponse(locs, DatanodeProtocol.DNA_CACHE)
};
}
private static DatanodeCommand[] uncacheBlock(HdfsBlockLocation loc) {
return uncacheBlocks(new HdfsBlockLocation[] {loc});
}
private static DatanodeCommand[] uncacheBlocks(HdfsBlockLocation[] locs) {
return new DatanodeCommand[] {
getResponse(locs, DatanodeProtocol.DNA_UNCACHE)
};
}
/**
* Creates a cache or uncache DatanodeCommand from an array of locations
*/
private static DatanodeCommand getResponse(HdfsBlockLocation[] locs,
int action) {
String bpid = locs[0].getLocatedBlock().getBlock().getBlockPoolId();
Block[] blocks = new Block[locs.length];
for (int i=0; i<locs.length; i++) {
blocks[i] = locs[i].getLocatedBlock().getBlock().getLocalBlock();
}
return new BlockCommand(action, bpid, blocks);
}
private static long[] getBlockSizes(HdfsBlockLocation[] locs)
throws Exception {
long[] sizes = new long[locs.length];
for (int i=0; i<locs.length; i++) {
HdfsBlockLocation loc = locs[i];
String bpid = loc.getLocatedBlock().getBlock().getBlockPoolId();
Block block = loc.getLocatedBlock().getBlock().getLocalBlock();
ExtendedBlock extBlock = new ExtendedBlock(bpid, block);
FileChannel blockChannel =
((FileInputStream)fsd.getBlockInputStream(extBlock, 0)).getChannel();
sizes[i] = blockChannel.size();
}
return sizes;
}
/**
* Blocks until cache usage changes from the current value, then verifies
* against the expected new value.
*/
private long verifyExpectedCacheUsage(final long current,
final long expected) throws Exception {
long cacheUsed = fsd.getCacheUsed();
while (cacheUsed == current) {
cacheUsed = fsd.getCacheUsed();
Thread.sleep(100);
}
long cacheCapacity = fsd.getCacheCapacity();
long cacheRemaining = fsd.getCacheRemaining();
assertEquals("Sum of used and remaining cache does not equal total",
cacheCapacity, cacheUsed+cacheRemaining);
assertEquals("Unexpected amount of cache used", expected, cacheUsed);
return cacheUsed;
}
@Test(timeout=60000)
public void testCacheAndUncacheBlock() throws Exception {
final int NUM_BLOCKS = 5;
// Write a test file
final Path testFile = new Path("/testCacheBlock");
final long testFileLen = BLOCK_SIZE*NUM_BLOCKS;
DFSTestUtil.createFile(fs, testFile, testFileLen, (short)1, 0xABBAL);
// Get the details of the written file
HdfsBlockLocation[] locs =
(HdfsBlockLocation[])fs.getFileBlockLocations(testFile, 0, testFileLen);
assertEquals("Unexpected number of blocks", NUM_BLOCKS, locs.length);
final long[] blockSizes = getBlockSizes(locs);
// Check initial state
final long cacheCapacity = fsd.getCacheCapacity();
long cacheUsed = fsd.getCacheUsed();
long current = 0;
assertEquals("Unexpected cache capacity", CACHE_CAPACITY, cacheCapacity);
assertEquals("Unexpected amount of cache used", current, cacheUsed);
// Cache each block in succession, checking each time
for (int i=0; i<NUM_BLOCKS; i++) {
setHeartbeatResponse(cacheBlock(locs[i]));
current = verifyExpectedCacheUsage(current, current + blockSizes[i]);
}
// Uncache each block in succession, again checking each time
for (int i=0; i<NUM_BLOCKS; i++) {
setHeartbeatResponse(uncacheBlock(locs[i]));
current = verifyExpectedCacheUsage(current, current - blockSizes[i]);
}
}
@Test(timeout=60000)
public void testFilesExceedMaxLockedMemory() throws Exception {
// Create some test files that will exceed total cache capacity
// Don't forget that meta files take up space too!
final int numFiles = 4;
final long fileSize = CACHE_CAPACITY / numFiles;
final Path[] testFiles = new Path[4];
final HdfsBlockLocation[][] fileLocs = new HdfsBlockLocation[numFiles][];
final long[] fileSizes = new long[numFiles];
for (int i=0; i<numFiles; i++) {
testFiles[i] = new Path("/testFilesExceedMaxLockedMemory-" + i);
DFSTestUtil.createFile(fs, testFiles[i], fileSize, (short)1, 0xDFAL);
fileLocs[i] = (HdfsBlockLocation[])fs.getFileBlockLocations(
testFiles[i], 0, fileSize);
// Get the file size (sum of blocks)
long[] sizes = getBlockSizes(fileLocs[i]);
for (int j=0; j<sizes.length; j++) {
fileSizes[i] += sizes[j];
}
}
// Cache the first n-1 files
long current = 0;
for (int i=0; i<numFiles-1; i++) {
setHeartbeatResponse(cacheBlocks(fileLocs[i]));
current = verifyExpectedCacheUsage(current, current + fileSizes[i]);
}
final long oldCurrent = current;
// nth file should hit a capacity exception
final LogVerificationAppender appender = new LogVerificationAppender();
final Logger logger = Logger.getRootLogger();
logger.addAppender(appender);
setHeartbeatResponse(cacheBlocks(fileLocs[numFiles-1]));
int lines = 0;
while (lines == 0) {
Thread.sleep(100);
lines = appender.countLinesWithMessage(
DFSConfigKeys.DFS_DATANODE_MAX_LOCKED_MEMORY_KEY + " exceeded");
}
// Uncache the cached part of the nth file
setHeartbeatResponse(uncacheBlocks(fileLocs[numFiles-1]));
while (fsd.getCacheUsed() != oldCurrent) {
Thread.sleep(100);
}
// Uncache the n-1 files
for (int i=0; i<numFiles-1; i++) {
setHeartbeatResponse(uncacheBlocks(fileLocs[i]));
current = verifyExpectedCacheUsage(current, current - fileSizes[i]);
}
}
}