HDFS-5708. The CacheManager throws a NPE in the DataNode logs when processing cache reports that refer to a block not known to the BlockManager. Contributed by Colin Patrick McCabe.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1554594 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
97e881b955
commit
07e4fb1455
|
@ -244,12 +244,14 @@ Trunk (Unreleased)
|
||||||
HDFS-5636. Enforce a max TTL per cache pool. (awang via cmccabe)
|
HDFS-5636. Enforce a max TTL per cache pool. (awang via cmccabe)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
HDFS-5349. DNA_CACHE and DNA_UNCACHE should be by blockId only. (cmccabe)
|
HDFS-5349. DNA_CACHE and DNA_UNCACHE should be by blockId only. (cmccabe)
|
||||||
|
|
||||||
HDFS-5665. Remove the unnecessary writeLock while initializing CacheManager
|
HDFS-5665. Remove the unnecessary writeLock while initializing CacheManager
|
||||||
in FsNameSystem Ctor. (Uma Maheswara Rao G via Andrew Wang)
|
in FsNameSystem Ctor. (Uma Maheswara Rao G via Andrew Wang)
|
||||||
|
|
||||||
BUG FIXES
|
BUG FIXES
|
||||||
|
|
||||||
HADOOP-9635 Fix potential Stack Overflow in DomainSocket.c (V. Karthik Kumar
|
HADOOP-9635 Fix potential Stack Overflow in DomainSocket.c (V. Karthik Kumar
|
||||||
via cmccabe)
|
via cmccabe)
|
||||||
|
|
||||||
|
@ -456,6 +458,10 @@ Trunk (Unreleased)
|
||||||
HDFS-5701. Fix the CacheAdmin -addPool -maxTtl option name.
|
HDFS-5701. Fix the CacheAdmin -addPool -maxTtl option name.
|
||||||
(Stephen Chu via wang)
|
(Stephen Chu via wang)
|
||||||
|
|
||||||
|
HDFS-5708. The CacheManager throws a NPE in the DataNode logs when
|
||||||
|
processing cache reports that refer to a block not known to the
|
||||||
|
BlockManager. (cmccabe via wang)
|
||||||
|
|
||||||
BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS
|
BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS
|
||||||
|
|
||||||
HDFS-4985. Add storage type to the protocol and expose it in block report
|
HDFS-4985. Add storage type to the protocol and expose it in block report
|
||||||
|
|
|
@ -460,14 +460,21 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
||||||
directive.getReplication()) * blockInfo.getNumBytes();
|
directive.getReplication()) * blockInfo.getNumBytes();
|
||||||
cachedTotal += cachedByBlock;
|
cachedTotal += cachedByBlock;
|
||||||
|
|
||||||
if (mark != ocblock.getMark()) {
|
if ((mark != ocblock.getMark()) ||
|
||||||
// Mark hasn't been set in this scan, so update replication and mark.
|
(ocblock.getReplication() < directive.getReplication())) {
|
||||||
|
//
|
||||||
|
// Overwrite the block's replication and mark in two cases:
|
||||||
|
//
|
||||||
|
// 1. If the mark on the CachedBlock is different from the mark for
|
||||||
|
// this scan, that means the block hasn't been updated during this
|
||||||
|
// scan, and we should overwrite whatever is there, since it is no
|
||||||
|
// longer valid.
|
||||||
|
//
|
||||||
|
// 2. If the replication in the CachedBlock is less than what the
|
||||||
|
// directive asks for, we want to increase the block's replication
|
||||||
|
// field to what the directive asks for.
|
||||||
|
//
|
||||||
ocblock.setReplicationAndMark(directive.getReplication(), mark);
|
ocblock.setReplicationAndMark(directive.getReplication(), mark);
|
||||||
} else {
|
|
||||||
// Mark already set in this scan. Set replication to highest value in
|
|
||||||
// any CacheDirective that covers this file.
|
|
||||||
ocblock.setReplicationAndMark((short)Math.max(
|
|
||||||
directive.getReplication(), ocblock.getReplication()), mark);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -483,6 +490,36 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String findReasonForNotCaching(CachedBlock cblock,
|
||||||
|
BlockInfo blockInfo) {
|
||||||
|
if (blockInfo == null) {
|
||||||
|
// Somehow, a cache report with the block arrived, but the block
|
||||||
|
// reports from the DataNode haven't (yet?) described such a block.
|
||||||
|
// Alternately, the NameNode might have invalidated the block, but the
|
||||||
|
// DataNode hasn't caught up. In any case, we want to tell the DN
|
||||||
|
// to uncache this.
|
||||||
|
return "not tracked by the BlockManager";
|
||||||
|
} else if (!blockInfo.isComplete()) {
|
||||||
|
// When a cached block changes state from complete to some other state
|
||||||
|
// on the DataNode (perhaps because of append), it will begin the
|
||||||
|
// uncaching process. However, the uncaching process is not
|
||||||
|
// instantaneous, especially if clients have pinned the block. So
|
||||||
|
// there may be a period of time when incomplete blocks remain cached
|
||||||
|
// on the DataNodes.
|
||||||
|
return "not complete";
|
||||||
|
} else if (cblock.getReplication() == 0) {
|
||||||
|
// Since 0 is not a valid value for a cache directive's replication
|
||||||
|
// field, seeing a replication of 0 on a CacheBlock means that it
|
||||||
|
// has never been reached by any sweep.
|
||||||
|
return "not needed by any directives";
|
||||||
|
} else if (cblock.getMark() != mark) {
|
||||||
|
// Although the block was needed in the past, we didn't reach it during
|
||||||
|
// the current sweep. Therefore, it doesn't need to be cached any more.
|
||||||
|
return "no longer needed by any directives";
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scan through the cached block map.
|
* Scan through the cached block map.
|
||||||
* Any blocks which are under-replicated should be assigned new Datanodes.
|
* Any blocks which are under-replicated should be assigned new Datanodes.
|
||||||
|
@ -508,11 +545,17 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
||||||
iter.remove();
|
iter.remove();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// If the block's mark doesn't match with the mark of this scan, that
|
BlockInfo blockInfo = blockManager.
|
||||||
// means that this block couldn't be reached during this scan. That means
|
getStoredBlock(new Block(cblock.getBlockId()));
|
||||||
// it doesn't need to be cached any more.
|
String reason = findReasonForNotCaching(cblock, blockInfo);
|
||||||
int neededCached = (cblock.getMark() != mark) ?
|
int neededCached = 0;
|
||||||
0 : cblock.getReplication();
|
if (reason != null) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("not caching " + cblock + " because it is " + reason);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
neededCached = cblock.getReplication();
|
||||||
|
}
|
||||||
int numCached = cached.size();
|
int numCached = cached.size();
|
||||||
if (numCached >= neededCached) {
|
if (numCached >= neededCached) {
|
||||||
// If we have enough replicas, drop all pending cached.
|
// If we have enough replicas, drop all pending cached.
|
||||||
|
@ -612,8 +655,10 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
|
||||||
BlockInfo blockInfo = blockManager.
|
BlockInfo blockInfo = blockManager.
|
||||||
getStoredBlock(new Block(cachedBlock.getBlockId()));
|
getStoredBlock(new Block(cachedBlock.getBlockId()));
|
||||||
if (blockInfo == null) {
|
if (blockInfo == null) {
|
||||||
LOG.debug("Not caching block " + cachedBlock + " because it " +
|
if (LOG.isDebugEnabled()) {
|
||||||
"was deleted from all DataNodes.");
|
LOG.debug("Not caching block " + cachedBlock + " because there " +
|
||||||
|
"is no record of it on the NameNode.");
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!blockInfo.isComplete()) {
|
if (!blockInfo.isComplete()) {
|
||||||
|
|
|
@ -62,7 +62,6 @@ import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
|
||||||
import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
|
import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
|
||||||
import org.apache.hadoop.hdfs.protocol.DatanodeID;
|
import org.apache.hadoop.hdfs.protocol.DatanodeID;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
|
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
|
import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||||
|
@ -940,39 +939,28 @@ public final class CacheManager {
|
||||||
final List<Long> blockIds) {
|
final List<Long> blockIds) {
|
||||||
CachedBlocksList cached = datanode.getCached();
|
CachedBlocksList cached = datanode.getCached();
|
||||||
cached.clear();
|
cached.clear();
|
||||||
|
CachedBlocksList cachedList = datanode.getCached();
|
||||||
|
CachedBlocksList pendingCachedList = datanode.getPendingCached();
|
||||||
for (Iterator<Long> iter = blockIds.iterator(); iter.hasNext(); ) {
|
for (Iterator<Long> iter = blockIds.iterator(); iter.hasNext(); ) {
|
||||||
Block block = new Block(iter.next());
|
long blockId = iter.next();
|
||||||
BlockInfo blockInfo = blockManager.getStoredBlock(block);
|
|
||||||
if (!blockInfo.isComplete()) {
|
|
||||||
LOG.warn("Ignoring block id " + block.getBlockId() + ", because " +
|
|
||||||
"it is in not complete yet. It is in state " +
|
|
||||||
blockInfo.getBlockUCState());
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
Collection<DatanodeDescriptor> corruptReplicas =
|
|
||||||
blockManager.getCorruptReplicas(blockInfo);
|
|
||||||
if ((corruptReplicas != null) && corruptReplicas.contains(datanode)) {
|
|
||||||
// The NameNode will eventually remove or update the corrupt block.
|
|
||||||
// Until then, we pretend that it isn't cached.
|
|
||||||
LOG.warn("Ignoring cached replica on " + datanode + " of " + block +
|
|
||||||
" because it is corrupt.");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
CachedBlock cachedBlock =
|
CachedBlock cachedBlock =
|
||||||
new CachedBlock(block.getBlockId(), (short)0, false);
|
new CachedBlock(blockId, (short)0, false);
|
||||||
CachedBlock prevCachedBlock = cachedBlocks.get(cachedBlock);
|
CachedBlock prevCachedBlock = cachedBlocks.get(cachedBlock);
|
||||||
// Use the existing CachedBlock if it's present; otherwise,
|
// Add the block ID from the cache report to the cachedBlocks map
|
||||||
// insert a new one.
|
// if it's not already there.
|
||||||
if (prevCachedBlock != null) {
|
if (prevCachedBlock != null) {
|
||||||
cachedBlock = prevCachedBlock;
|
cachedBlock = prevCachedBlock;
|
||||||
} else {
|
} else {
|
||||||
cachedBlocks.put(cachedBlock);
|
cachedBlocks.put(cachedBlock);
|
||||||
}
|
}
|
||||||
if (!cachedBlock.isPresent(datanode.getCached())) {
|
// Add the block to the datanode's implicit cached block list
|
||||||
datanode.getCached().add(cachedBlock);
|
// if it's not already there. Similarly, remove it from the pending
|
||||||
|
// cached block list if it exists there.
|
||||||
|
if (!cachedBlock.isPresent(cachedList)) {
|
||||||
|
cachedList.add(cachedBlock);
|
||||||
}
|
}
|
||||||
if (cachedBlock.isPresent(datanode.getPendingCached())) {
|
if (cachedBlock.isPresent(pendingCachedList)) {
|
||||||
datanode.getPendingCached().remove(cachedBlock);
|
pendingCachedList.remove(cachedBlock);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -69,6 +69,7 @@ import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo.Expiration;
|
||||||
import org.apache.hadoop.hdfs.protocol.CachePoolStats;
|
import org.apache.hadoop.hdfs.protocol.CachePoolStats;
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
|
import org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor;
|
||||||
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
|
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.CachedBlocksList.Type;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
|
import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
|
||||||
import org.apache.hadoop.io.nativeio.NativeIO;
|
import org.apache.hadoop.io.nativeio.NativeIO;
|
||||||
import org.apache.hadoop.io.nativeio.NativeIO.POSIX.CacheManipulator;
|
import org.apache.hadoop.io.nativeio.NativeIO.POSIX.CacheManipulator;
|
||||||
|
@ -796,7 +797,15 @@ public class TestCacheDirectives {
|
||||||
}
|
}
|
||||||
}, 500, 60000);
|
}, 500, 60000);
|
||||||
|
|
||||||
|
// Send a cache report referring to a bogus block. It is important that
|
||||||
|
// the NameNode be robust against this.
|
||||||
NamenodeProtocols nnRpc = namenode.getRpcServer();
|
NamenodeProtocols nnRpc = namenode.getRpcServer();
|
||||||
|
DataNode dn0 = cluster.getDataNodes().get(0);
|
||||||
|
String bpid = cluster.getNamesystem().getBlockPoolId();
|
||||||
|
LinkedList<Long> bogusBlockIds = new LinkedList<Long> ();
|
||||||
|
bogusBlockIds.add(999999L);
|
||||||
|
nnRpc.cacheReport(dn0.getDNRegistrationForBP(bpid), bpid, bogusBlockIds);
|
||||||
|
|
||||||
Path rootDir = helper.getDefaultWorkingDirectory(dfs);
|
Path rootDir = helper.getDefaultWorkingDirectory(dfs);
|
||||||
// Create the pool
|
// Create the pool
|
||||||
final String pool = "friendlyPool";
|
final String pool = "friendlyPool";
|
||||||
|
|
Loading…
Reference in New Issue