HDFS-1972. Fencing mechanism for block invalidations and replications. Contributed by Todd Lipcon.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1221608 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
36d1c49486
commit
31c91706f7
|
@ -73,3 +73,5 @@ HDFS-2678. When a FailoverProxyProvider is used, DFSClient should not retry conn
|
||||||
HDFS-2682. When a FailoverProxyProvider is used, Client should not retry for 45 times if it is timing out to connect to server. (Uma Maheswara Rao G via todd)
|
HDFS-2682. When a FailoverProxyProvider is used, Client should not retry for 45 times if it is timing out to connect to server. (Uma Maheswara Rao G via todd)
|
||||||
|
|
||||||
HDFS-2693. Fix synchronization issues around state transition (todd)
|
HDFS-2693. Fix synchronization issues around state transition (todd)
|
||||||
|
|
||||||
|
HDFS-1972. Fencing mechanism for block invalidations and replications (todd)
|
||||||
|
|
|
@ -28,6 +28,7 @@ import java.util.Iterator;
|
||||||
import java.util.LinkedList;
|
import java.util.LinkedList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
@ -68,6 +69,8 @@ import org.apache.hadoop.net.Node;
|
||||||
import org.apache.hadoop.util.Daemon;
|
import org.apache.hadoop.util.Daemon;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Keeps information related to the blocks stored in the Hadoop cluster.
|
* Keeps information related to the blocks stored in the Hadoop cluster.
|
||||||
|
@ -91,6 +94,7 @@ public class BlockManager {
|
||||||
private volatile long underReplicatedBlocksCount = 0L;
|
private volatile long underReplicatedBlocksCount = 0L;
|
||||||
private volatile long scheduledReplicationBlocksCount = 0L;
|
private volatile long scheduledReplicationBlocksCount = 0L;
|
||||||
private volatile long excessBlocksCount = 0L;
|
private volatile long excessBlocksCount = 0L;
|
||||||
|
private volatile long postponedMisreplicatedBlocksCount = 0L;
|
||||||
|
|
||||||
/** Used by metrics */
|
/** Used by metrics */
|
||||||
public long getPendingReplicationBlocksCount() {
|
public long getPendingReplicationBlocksCount() {
|
||||||
|
@ -116,6 +120,10 @@ public class BlockManager {
|
||||||
public long getExcessBlocksCount() {
|
public long getExcessBlocksCount() {
|
||||||
return excessBlocksCount;
|
return excessBlocksCount;
|
||||||
}
|
}
|
||||||
|
/** Used by metrics */
|
||||||
|
public long getPostponedMisreplicatedBlocksCount() {
|
||||||
|
return postponedMisreplicatedBlocksCount;
|
||||||
|
}
|
||||||
|
|
||||||
/**replicationRecheckInterval is how often namenode checks for new replication work*/
|
/**replicationRecheckInterval is how often namenode checks for new replication work*/
|
||||||
private final long replicationRecheckInterval;
|
private final long replicationRecheckInterval;
|
||||||
|
@ -135,6 +143,15 @@ public class BlockManager {
|
||||||
/** Blocks to be invalidated. */
|
/** Blocks to be invalidated. */
|
||||||
private final InvalidateBlocks invalidateBlocks;
|
private final InvalidateBlocks invalidateBlocks;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* After a failover, over-replicated blocks may not be handled
|
||||||
|
* until all of the replicas have done a block report to the
|
||||||
|
* new active. This is to make sure that this NameNode has been
|
||||||
|
* notified of all block deletions that might have been pending
|
||||||
|
* when the failover happened.
|
||||||
|
*/
|
||||||
|
private final Set<Block> postponedMisreplicatedBlocks = Sets.newHashSet();
|
||||||
|
|
||||||
//
|
//
|
||||||
// Keeps a TreeSet for every named node. Each treeset contains
|
// Keeps a TreeSet for every named node. Each treeset contains
|
||||||
// a list of the blocks that are "extra" at that location. We'll
|
// a list of the blocks that are "extra" at that location. We'll
|
||||||
|
@ -316,6 +333,31 @@ public class BlockManager {
|
||||||
out.println("Metasave: Blocks waiting for replication: " +
|
out.println("Metasave: Blocks waiting for replication: " +
|
||||||
neededReplications.size());
|
neededReplications.size());
|
||||||
for (Block block : neededReplications) {
|
for (Block block : neededReplications) {
|
||||||
|
dumpBlockMeta(block, out);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump any postponed over-replicated blocks
|
||||||
|
out.println("Mis-replicated blocks that have been postponed:");
|
||||||
|
for (Block block : postponedMisreplicatedBlocks) {
|
||||||
|
dumpBlockMeta(block, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump blocks from pendingReplication
|
||||||
|
pendingReplications.metaSave(out);
|
||||||
|
|
||||||
|
// Dump blocks that are waiting to be deleted
|
||||||
|
invalidateBlocks.dump(out);
|
||||||
|
|
||||||
|
// Dump all datanodes
|
||||||
|
getDatanodeManager().datanodeDump(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dump the metadata for the given block in a human-readable
|
||||||
|
* form.
|
||||||
|
*/
|
||||||
|
private void dumpBlockMeta(Block block, PrintWriter out) {
|
||||||
List<DatanodeDescriptor> containingNodes =
|
List<DatanodeDescriptor> containingNodes =
|
||||||
new ArrayList<DatanodeDescriptor>();
|
new ArrayList<DatanodeDescriptor>();
|
||||||
List<DatanodeDescriptor> containingLiveReplicasNodes =
|
List<DatanodeDescriptor> containingLiveReplicasNodes =
|
||||||
|
@ -354,21 +396,14 @@ public class BlockManager {
|
||||||
node.isDecommissionInProgress()) {
|
node.isDecommissionInProgress()) {
|
||||||
state = "(decommissioned)";
|
state = "(decommissioned)";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (node.areBlockContentsStale()) {
|
||||||
|
state += " (block deletions maybe out of date)";
|
||||||
|
}
|
||||||
out.print(" " + node + state + " : ");
|
out.print(" " + node + state + " : ");
|
||||||
}
|
}
|
||||||
out.println("");
|
out.println("");
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Dump blocks from pendingReplication
|
|
||||||
pendingReplications.metaSave(out);
|
|
||||||
|
|
||||||
// Dump blocks that are waiting to be deleted
|
|
||||||
invalidateBlocks.dump(out);
|
|
||||||
|
|
||||||
// Dump all datanodes
|
|
||||||
getDatanodeManager().datanodeDump(out);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** @return maxReplicationStreams */
|
/** @return maxReplicationStreams */
|
||||||
public int getMaxReplicationStreams() {
|
public int getMaxReplicationStreams() {
|
||||||
|
@ -782,6 +817,14 @@ public class BlockManager {
|
||||||
|
|
||||||
node.resetBlocks();
|
node.resetBlocks();
|
||||||
invalidateBlocks.remove(node.getStorageID());
|
invalidateBlocks.remove(node.getStorageID());
|
||||||
|
|
||||||
|
// If the DN hasn't block-reported since the most recent
|
||||||
|
// failover, then we may have been holding up on processing
|
||||||
|
// over-replicated blocks because of it. But we can now
|
||||||
|
// process those blocks.
|
||||||
|
if (node.areBlockContentsStale()) {
|
||||||
|
rescanPostponedMisreplicatedBlocks();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -879,10 +922,17 @@ public class BlockManager {
|
||||||
+ " because datanode " + dn.getName() + " does not exist.");
|
+ " because datanode " + dn.getName() + " does not exist.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check how many copies we have of the block. If we have at least one
|
// Check how many copies we have of the block
|
||||||
// copy on a live node, then we can delete it.
|
NumberReplicas nr = countNodes(blk);
|
||||||
int count = countNodes(blk).liveReplicas();
|
if (nr.replicasOnStaleNodes() > 0) {
|
||||||
if (count >= 1) {
|
NameNode.stateChangeLog.info("BLOCK* invalidateBlocks: postponing " +
|
||||||
|
"invalidation of block " + blk + " on " + dn + " because " +
|
||||||
|
nr.replicasOnStaleNodes() + " replica(s) are located on nodes " +
|
||||||
|
"with potentially out-of-date block reports.");
|
||||||
|
postponeBlock(blk);
|
||||||
|
|
||||||
|
} else if (nr.liveReplicas() >= 1) {
|
||||||
|
// If we have at least one copy on a live node, then we can delete it.
|
||||||
addToInvalidates(blk, dn);
|
addToInvalidates(blk, dn);
|
||||||
removeStoredBlock(blk, node);
|
removeStoredBlock(blk, node);
|
||||||
if(NameNode.stateChangeLog.isDebugEnabled()) {
|
if(NameNode.stateChangeLog.isDebugEnabled()) {
|
||||||
|
@ -895,6 +945,13 @@ public class BlockManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void postponeBlock(Block blk) {
|
||||||
|
if (postponedMisreplicatedBlocks.add(blk)) {
|
||||||
|
postponedMisreplicatedBlocksCount++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void updateState() {
|
void updateState() {
|
||||||
pendingReplicationBlocksCount = pendingReplications.size();
|
pendingReplicationBlocksCount = pendingReplications.size();
|
||||||
underReplicatedBlocksCount = neededReplications.size();
|
underReplicatedBlocksCount = neededReplications.size();
|
||||||
|
@ -933,7 +990,7 @@ public class BlockManager {
|
||||||
*
|
*
|
||||||
* @return number of blocks scheduled for replication during this iteration.
|
* @return number of blocks scheduled for replication during this iteration.
|
||||||
*/
|
*/
|
||||||
private int computeReplicationWork(int blocksToProcess) throws IOException {
|
int computeReplicationWork(int blocksToProcess) throws IOException {
|
||||||
List<List<Block>> blocksToReplicate = null;
|
List<List<Block>> blocksToReplicate = null;
|
||||||
namesystem.writeLock();
|
namesystem.writeLock();
|
||||||
try {
|
try {
|
||||||
|
@ -984,8 +1041,10 @@ public class BlockManager {
|
||||||
NumberReplicas numReplicas = new NumberReplicas();
|
NumberReplicas numReplicas = new NumberReplicas();
|
||||||
srcNode = chooseSourceDatanode(
|
srcNode = chooseSourceDatanode(
|
||||||
block, containingNodes, liveReplicaNodes, numReplicas);
|
block, containingNodes, liveReplicaNodes, numReplicas);
|
||||||
if(srcNode == null) // block can not be replicated from any node
|
if(srcNode == null) { // block can not be replicated from any node
|
||||||
|
LOG.debug("Block " + block + " cannot be repl from any node");
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
assert liveReplicaNodes.size() == numReplicas.liveReplicas();
|
assert liveReplicaNodes.size() == numReplicas.liveReplicas();
|
||||||
// do not schedule more if enough replicas is already pending
|
// do not schedule more if enough replicas is already pending
|
||||||
|
@ -1235,7 +1294,7 @@ public class BlockManager {
|
||||||
srcNode = node;
|
srcNode = node;
|
||||||
}
|
}
|
||||||
if(numReplicas != null)
|
if(numReplicas != null)
|
||||||
numReplicas.initialize(live, decommissioned, corrupt, excess);
|
numReplicas.initialize(live, decommissioned, corrupt, excess, 0);
|
||||||
return srcNode;
|
return srcNode;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1316,6 +1375,19 @@ public class BlockManager {
|
||||||
} else {
|
} else {
|
||||||
processReport(node, newReport);
|
processReport(node, newReport);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Now that we have an up-to-date block report, we know that any
|
||||||
|
// deletions from a previous NN iteration have been accounted for.
|
||||||
|
boolean staleBefore = node.areBlockContentsStale();
|
||||||
|
node.receivedBlockReport();
|
||||||
|
if (staleBefore && !node.areBlockContentsStale()) {
|
||||||
|
LOG.info("BLOCK* processReport: " +
|
||||||
|
"Received first block report from " + node +
|
||||||
|
" after becoming active. Its block contents are no longer" +
|
||||||
|
" considered stale.");
|
||||||
|
rescanPostponedMisreplicatedBlocks();
|
||||||
|
}
|
||||||
|
|
||||||
} finally {
|
} finally {
|
||||||
endTime = Util.now();
|
endTime = Util.now();
|
||||||
namesystem.writeUnlock();
|
namesystem.writeUnlock();
|
||||||
|
@ -1328,6 +1400,37 @@ public class BlockManager {
|
||||||
+ ", processing time: " + (endTime - startTime) + " msecs");
|
+ ", processing time: " + (endTime - startTime) + " msecs");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Rescan the list of blocks which were previously postponed.
|
||||||
|
*/
|
||||||
|
private void rescanPostponedMisreplicatedBlocks() {
|
||||||
|
for (Iterator<Block> it = postponedMisreplicatedBlocks.iterator();
|
||||||
|
it.hasNext();) {
|
||||||
|
Block b = it.next();
|
||||||
|
|
||||||
|
BlockInfo bi = blocksMap.getStoredBlock(b);
|
||||||
|
if (bi == null) {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
|
||||||
|
"Postponed mis-replicated block " + b + " no longer found " +
|
||||||
|
"in block map.");
|
||||||
|
}
|
||||||
|
it.remove();
|
||||||
|
postponedMisreplicatedBlocksCount--;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
MisReplicationResult res = processMisReplicatedBlock(bi);
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("BLOCK* rescanPostponedMisreplicatedBlocks: " +
|
||||||
|
"Re-scanned block " + b + ", result is " + res);
|
||||||
|
}
|
||||||
|
if (res != MisReplicationResult.POSTPONE) {
|
||||||
|
it.remove();
|
||||||
|
postponedMisreplicatedBlocksCount--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void processReport(final DatanodeDescriptor node,
|
private void processReport(final DatanodeDescriptor node,
|
||||||
final BlockListAsLongs report) throws IOException {
|
final BlockListAsLongs report) throws IOException {
|
||||||
// Normal case:
|
// Normal case:
|
||||||
|
@ -1505,8 +1608,9 @@ public class BlockManager {
|
||||||
|
|
||||||
// Ignore replicas already scheduled to be removed from the DN
|
// Ignore replicas already scheduled to be removed from the DN
|
||||||
if(invalidateBlocks.contains(dn.getStorageID(), block)) {
|
if(invalidateBlocks.contains(dn.getStorageID(), block)) {
|
||||||
assert storedBlock.findDatanode(dn) < 0 : "Block " + block
|
/* TODO: following assertion is incorrect, see HDFS-2668
|
||||||
+ " in recentInvalidatesSet should not appear in DN " + dn;
|
assert storedBlock.findDatanode(dn) < 0 : "Block " + block
|
||||||
|
+ " in recentInvalidatesSet should not appear in DN " + dn; */
|
||||||
return storedBlock;
|
return storedBlock;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1773,15 +1877,50 @@ public class BlockManager {
|
||||||
public void processMisReplicatedBlocks() {
|
public void processMisReplicatedBlocks() {
|
||||||
assert namesystem.hasWriteLock();
|
assert namesystem.hasWriteLock();
|
||||||
|
|
||||||
long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0;
|
long nrInvalid = 0, nrOverReplicated = 0, nrUnderReplicated = 0, nrPostponed = 0;
|
||||||
neededReplications.clear();
|
neededReplications.clear();
|
||||||
for (BlockInfo block : blocksMap.getBlocks()) {
|
for (BlockInfo block : blocksMap.getBlocks()) {
|
||||||
|
MisReplicationResult res = processMisReplicatedBlock(block);
|
||||||
|
LOG.info("block " + block + ": " + res);
|
||||||
|
switch (res) {
|
||||||
|
case UNDER_REPLICATED:
|
||||||
|
nrUnderReplicated++;
|
||||||
|
break;
|
||||||
|
case OVER_REPLICATED:
|
||||||
|
nrOverReplicated++;
|
||||||
|
break;
|
||||||
|
case INVALID:
|
||||||
|
nrInvalid++;
|
||||||
|
break;
|
||||||
|
case POSTPONE:
|
||||||
|
nrPostponed++;
|
||||||
|
postponeBlock(block);
|
||||||
|
break;
|
||||||
|
case OK:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new AssertionError("Invalid enum value: " + res);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
LOG.info("Total number of blocks = " + blocksMap.size());
|
||||||
|
LOG.info("Number of invalid blocks = " + nrInvalid);
|
||||||
|
LOG.info("Number of under-replicated blocks = " + nrUnderReplicated);
|
||||||
|
LOG.info("Number of over-replicated blocks = " + nrOverReplicated +
|
||||||
|
((nrPostponed > 0) ? ( " (" + nrPostponed + " postponed)") : ""));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Process a single possibly misreplicated block. This adds it to the
|
||||||
|
* appropriate queues if necessary, and returns a result code indicating
|
||||||
|
* what happened with it.
|
||||||
|
*/
|
||||||
|
private MisReplicationResult processMisReplicatedBlock(BlockInfo block) {
|
||||||
INodeFile fileINode = block.getINode();
|
INodeFile fileINode = block.getINode();
|
||||||
if (fileINode == null) {
|
if (fileINode == null) {
|
||||||
// block does not belong to any file
|
// block does not belong to any file
|
||||||
nrInvalid++;
|
|
||||||
addToInvalidates(block);
|
addToInvalidates(block);
|
||||||
continue;
|
return MisReplicationResult.INVALID;
|
||||||
}
|
}
|
||||||
// calculate current replication
|
// calculate current replication
|
||||||
short expectedReplication = fileINode.getReplication();
|
short expectedReplication = fileINode.getReplication();
|
||||||
|
@ -1791,21 +1930,26 @@ public class BlockManager {
|
||||||
if (isNeededReplication(block, expectedReplication, numCurrentReplica)) {
|
if (isNeededReplication(block, expectedReplication, numCurrentReplica)) {
|
||||||
if (neededReplications.add(block, numCurrentReplica, num
|
if (neededReplications.add(block, numCurrentReplica, num
|
||||||
.decommissionedReplicas(), expectedReplication)) {
|
.decommissionedReplicas(), expectedReplication)) {
|
||||||
nrUnderReplicated++;
|
return MisReplicationResult.UNDER_REPLICATED;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (numCurrentReplica > expectedReplication) {
|
if (numCurrentReplica > expectedReplication) {
|
||||||
// over-replicated block
|
if (num.replicasOnStaleNodes() > 0) {
|
||||||
nrOverReplicated++;
|
// If any of the replicas of this block are on nodes that are
|
||||||
processOverReplicatedBlock(block, expectedReplication, null, null);
|
// considered "stale", then these replicas may in fact have
|
||||||
}
|
// already been deleted. So, we cannot safely act on the
|
||||||
|
// over-replication until a later point in time, when
|
||||||
|
// the "stale" nodes have block reported.
|
||||||
|
return MisReplicationResult.POSTPONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG.info("Total number of blocks = " + blocksMap.size());
|
// over-replicated block
|
||||||
LOG.info("Number of invalid blocks = " + nrInvalid);
|
processOverReplicatedBlock(block, expectedReplication, null, null);
|
||||||
LOG.info("Number of under-replicated blocks = " + nrUnderReplicated);
|
return MisReplicationResult.OVER_REPLICATED;
|
||||||
LOG.info("Number of over-replicated blocks = " + nrOverReplicated);
|
}
|
||||||
|
|
||||||
|
return MisReplicationResult.OK;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Set replication for the blocks. */
|
/** Set replication for the blocks. */
|
||||||
|
@ -1851,6 +1995,14 @@ public class BlockManager {
|
||||||
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
|
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
|
||||||
it.hasNext();) {
|
it.hasNext();) {
|
||||||
DatanodeDescriptor cur = it.next();
|
DatanodeDescriptor cur = it.next();
|
||||||
|
if (cur.areBlockContentsStale()) {
|
||||||
|
LOG.info("BLOCK* processOverReplicatedBlock: " +
|
||||||
|
"Postponing processing of over-replicated block " +
|
||||||
|
block + " since datanode " + cur + " does not yet have up-to-date " +
|
||||||
|
"block information.");
|
||||||
|
postponeBlock(block);
|
||||||
|
return;
|
||||||
|
}
|
||||||
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(cur
|
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(cur
|
||||||
.getStorageID());
|
.getStorageID());
|
||||||
if (excessBlocks == null || !excessBlocks.contains(block)) {
|
if (excessBlocks == null || !excessBlocks.contains(block)) {
|
||||||
|
@ -2153,13 +2305,15 @@ public class BlockManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return the number of nodes that are live and decommissioned.
|
* Return the number of nodes hosting a given block, grouped
|
||||||
|
* by the state of those replicas.
|
||||||
*/
|
*/
|
||||||
public NumberReplicas countNodes(Block b) {
|
public NumberReplicas countNodes(Block b) {
|
||||||
int count = 0;
|
int decommissioned = 0;
|
||||||
int live = 0;
|
int live = 0;
|
||||||
int corrupt = 0;
|
int corrupt = 0;
|
||||||
int excess = 0;
|
int excess = 0;
|
||||||
|
int stale = 0;
|
||||||
Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(b);
|
Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(b);
|
||||||
Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b);
|
Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b);
|
||||||
while (nodeIter.hasNext()) {
|
while (nodeIter.hasNext()) {
|
||||||
|
@ -2167,7 +2321,7 @@ public class BlockManager {
|
||||||
if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) {
|
if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) {
|
||||||
corrupt++;
|
corrupt++;
|
||||||
} else if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
} else if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
||||||
count++;
|
decommissioned++;
|
||||||
} else {
|
} else {
|
||||||
LightWeightLinkedSet<Block> blocksExcess = excessReplicateMap.get(node
|
LightWeightLinkedSet<Block> blocksExcess = excessReplicateMap.get(node
|
||||||
.getStorageID());
|
.getStorageID());
|
||||||
|
@ -2177,8 +2331,11 @@ public class BlockManager {
|
||||||
live++;
|
live++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (node.areBlockContentsStale()) {
|
||||||
|
stale++;
|
||||||
}
|
}
|
||||||
return new NumberReplicas(live, count, corrupt, excess);
|
}
|
||||||
|
return new NumberReplicas(live, decommissioned, corrupt, excess, stale);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -2323,10 +2480,14 @@ public class BlockManager {
|
||||||
}
|
}
|
||||||
|
|
||||||
public void removeBlock(Block block) {
|
public void removeBlock(Block block) {
|
||||||
|
assert namesystem.hasWriteLock();
|
||||||
block.setNumBytes(BlockCommand.NO_ACK);
|
block.setNumBytes(BlockCommand.NO_ACK);
|
||||||
addToInvalidates(block);
|
addToInvalidates(block);
|
||||||
corruptReplicas.removeFromCorruptReplicasMap(block);
|
corruptReplicas.removeFromCorruptReplicasMap(block);
|
||||||
blocksMap.removeBlock(block);
|
blocksMap.removeBlock(block);
|
||||||
|
if (postponedMisreplicatedBlocks.remove(block)) {
|
||||||
|
postponedMisreplicatedBlocksCount--;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public BlockInfo getStoredBlock(Block block) {
|
public BlockInfo getStoredBlock(Block block) {
|
||||||
|
@ -2387,8 +2548,10 @@ public class BlockManager {
|
||||||
namesystem.writeLock();
|
namesystem.writeLock();
|
||||||
try {
|
try {
|
||||||
// blocks should not be replicated or removed if safe mode is on
|
// blocks should not be replicated or removed if safe mode is on
|
||||||
if (namesystem.isInSafeMode())
|
if (namesystem.isInSafeMode()) {
|
||||||
|
LOG.debug("In safemode, not computing replication work");
|
||||||
return 0;
|
return 0;
|
||||||
|
}
|
||||||
// get blocks to invalidate for the nodeId
|
// get blocks to invalidate for the nodeId
|
||||||
assert nodeId != null;
|
assert nodeId != null;
|
||||||
return invalidateBlocks.invalidateWork(nodeId);
|
return invalidateBlocks.invalidateWork(nodeId);
|
||||||
|
@ -2571,6 +2734,19 @@ public class BlockManager {
|
||||||
return workFound;
|
return workFound;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear all queues that hold decisions previously made by
|
||||||
|
* this NameNode.
|
||||||
|
*/
|
||||||
|
public void clearQueues() {
|
||||||
|
neededReplications.clear();
|
||||||
|
pendingReplications.clear();
|
||||||
|
excessReplicateMap.clear();
|
||||||
|
invalidateBlocks.clear();
|
||||||
|
datanodeManager.clearPendingQueues();
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
private static class ReplicationWork {
|
private static class ReplicationWork {
|
||||||
|
|
||||||
private Block block;
|
private Block block;
|
||||||
|
@ -2601,4 +2777,22 @@ public class BlockManager {
|
||||||
this.targets = null;
|
this.targets = null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A simple result enum for the result of
|
||||||
|
* {@link BlockManager#processMisReplicatedBlock(BlockInfo)}.
|
||||||
|
*/
|
||||||
|
enum MisReplicationResult {
|
||||||
|
/** The block should be invalidated since it belongs to a deleted file. */
|
||||||
|
INVALID,
|
||||||
|
/** The block is currently under-replicated. */
|
||||||
|
UNDER_REPLICATED,
|
||||||
|
/** The block is currently over-replicated. */
|
||||||
|
OVER_REPLICATED,
|
||||||
|
/** A decision can't currently be made about this block. */
|
||||||
|
POSTPONE,
|
||||||
|
/** The block is properly replicated */
|
||||||
|
OK
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
|
||||||
initialize(conf, stats, clusterMap);
|
initialize(conf, stats, clusterMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
BlockPlacementPolicyDefault() {
|
protected BlockPlacementPolicyDefault() {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -94,6 +94,10 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
boolean contains(E e) {
|
boolean contains(E e) {
|
||||||
return blockq.contains(e);
|
return blockq.contains(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
synchronized void clear() {
|
||||||
|
blockq.clear();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private volatile BlockInfo blockList = null;
|
private volatile BlockInfo blockList = null;
|
||||||
|
@ -103,6 +107,24 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
public boolean isAlive = false;
|
public boolean isAlive = false;
|
||||||
public boolean needKeyUpdate = false;
|
public boolean needKeyUpdate = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set to false on any NN failover, and reset to true
|
||||||
|
* whenever a block report is received.
|
||||||
|
*/
|
||||||
|
private boolean heartbeatedSinceFailover = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* At startup or at any failover, the DNs in the cluster may
|
||||||
|
* have pending block deletions from a previous incarnation
|
||||||
|
* of the NameNode. Thus, we consider their block contents
|
||||||
|
* stale until we have received a block report. When a DN
|
||||||
|
* is considered stale, any replicas on it are transitively
|
||||||
|
* considered stale. If any block has at least one stale replica,
|
||||||
|
* then no invalidations will be processed for this block.
|
||||||
|
* See HDFS-1972.
|
||||||
|
*/
|
||||||
|
private boolean blockContentsStale = true;
|
||||||
|
|
||||||
// A system administrator can tune the balancer bandwidth parameter
|
// A system administrator can tune the balancer bandwidth parameter
|
||||||
// (dfs.balance.bandwidthPerSec) dynamically by calling
|
// (dfs.balance.bandwidthPerSec) dynamically by calling
|
||||||
// "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the
|
// "dfsadmin -setBalanacerBandwidth <newbandwidth>", at which point the
|
||||||
|
@ -282,6 +304,14 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
this.volumeFailures = 0;
|
this.volumeFailures = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void clearBlockQueues() {
|
||||||
|
synchronized (invalidateBlocks) {
|
||||||
|
this.invalidateBlocks.clear();
|
||||||
|
this.recoverBlocks.clear();
|
||||||
|
this.replicateBlocks.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public int numBlocks() {
|
public int numBlocks() {
|
||||||
return numBlocks;
|
return numBlocks;
|
||||||
}
|
}
|
||||||
|
@ -298,6 +328,7 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
this.lastUpdate = System.currentTimeMillis();
|
this.lastUpdate = System.currentTimeMillis();
|
||||||
this.xceiverCount = xceiverCount;
|
this.xceiverCount = xceiverCount;
|
||||||
this.volumeFailures = volFailures;
|
this.volumeFailures = volFailures;
|
||||||
|
this.heartbeatedSinceFailover = true;
|
||||||
rollBlocksScheduled(lastUpdate);
|
rollBlocksScheduled(lastUpdate);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -564,5 +595,36 @@ public class DatanodeDescriptor extends DatanodeInfo {
|
||||||
this.bandwidth = bandwidth;
|
this.bandwidth = bandwidth;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean areBlockContentsStale() {
|
||||||
|
return blockContentsStale;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void markStaleAfterFailover() {
|
||||||
|
heartbeatedSinceFailover = false;
|
||||||
|
blockContentsStale = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void receivedBlockReport() {
|
||||||
|
if (heartbeatedSinceFailover) {
|
||||||
|
blockContentsStale = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String dumpDatanode() {
|
||||||
|
StringBuilder sb = new StringBuilder(super.dumpDatanode());
|
||||||
|
int repl = replicateBlocks.size();
|
||||||
|
if (repl > 0) {
|
||||||
|
sb.append(" ").append(repl).append(" blocks to be replicated;");
|
||||||
|
}
|
||||||
|
int inval = invalidateBlocks.size();
|
||||||
|
if (inval > 0) {
|
||||||
|
sb.append(" ").append(inval).append(" blocks to be invalidated;");
|
||||||
|
}
|
||||||
|
int recover = recoverBlocks.size();
|
||||||
|
if (recover > 0) {
|
||||||
|
sb.append(" ").append(recover).append(" blocks to be recovered;");
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -945,4 +945,27 @@ public class DatanodeManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void markAllDatanodesStale() {
|
||||||
|
LOG.info("Marking all datandoes as stale");
|
||||||
|
synchronized (datanodeMap) {
|
||||||
|
for (DatanodeDescriptor dn : datanodeMap.values()) {
|
||||||
|
dn.markStaleAfterFailover();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear any actions that are queued up to be sent to the DNs
|
||||||
|
* on their next heartbeats. This includes block invalidations,
|
||||||
|
* recoveries, and replication requests.
|
||||||
|
*/
|
||||||
|
public void clearPendingQueues() {
|
||||||
|
synchronized (datanodeMap) {
|
||||||
|
for (DatanodeDescriptor dn : datanodeMap.values()) {
|
||||||
|
dn.clearBlockQueues();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -160,4 +160,9 @@ class InvalidateBlocks {
|
||||||
numBlocks -= toInvalidate.size();
|
numBlocks -= toInvalidate.size();
|
||||||
return toInvalidate;
|
return toInvalidate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
synchronized void clear() {
|
||||||
|
node2blocks.clear();
|
||||||
|
numBlocks = 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -26,20 +26,22 @@ public class NumberReplicas {
|
||||||
private int decommissionedReplicas;
|
private int decommissionedReplicas;
|
||||||
private int corruptReplicas;
|
private int corruptReplicas;
|
||||||
private int excessReplicas;
|
private int excessReplicas;
|
||||||
|
private int replicasOnStaleNodes;
|
||||||
|
|
||||||
NumberReplicas() {
|
NumberReplicas() {
|
||||||
initialize(0, 0, 0, 0);
|
initialize(0, 0, 0, 0, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
NumberReplicas(int live, int decommissioned, int corrupt, int excess) {
|
NumberReplicas(int live, int decommissioned, int corrupt, int excess, int stale) {
|
||||||
initialize(live, decommissioned, corrupt, excess);
|
initialize(live, decommissioned, corrupt, excess, stale);
|
||||||
}
|
}
|
||||||
|
|
||||||
void initialize(int live, int decommissioned, int corrupt, int excess) {
|
void initialize(int live, int decommissioned, int corrupt, int excess, int stale) {
|
||||||
liveReplicas = live;
|
liveReplicas = live;
|
||||||
decommissionedReplicas = decommissioned;
|
decommissionedReplicas = decommissioned;
|
||||||
corruptReplicas = corrupt;
|
corruptReplicas = corrupt;
|
||||||
excessReplicas = excess;
|
excessReplicas = excess;
|
||||||
|
replicasOnStaleNodes = stale;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int liveReplicas() {
|
public int liveReplicas() {
|
||||||
|
@ -54,4 +56,13 @@ public class NumberReplicas {
|
||||||
public int excessReplicas() {
|
public int excessReplicas() {
|
||||||
return excessReplicas;
|
return excessReplicas;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the number of replicas which are on stale nodes.
|
||||||
|
* This is not mutually exclusive with the other counts -- ie a
|
||||||
|
* replica may count as both "live" and "stale".
|
||||||
|
*/
|
||||||
|
public int replicasOnStaleNodes() {
|
||||||
|
return replicasOnStaleNodes;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -104,6 +104,14 @@ class PendingReplicationBlocks {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
synchronized (pendingReplications) {
|
||||||
|
pendingReplications.clear();
|
||||||
|
timedOutItems.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The total number of blocks that are undergoing replication
|
* The total number of blocks that are undergoing replication
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -498,6 +498,16 @@ class BPOfferService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run an immediate deletion report on this thread. Used by tests.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
void triggerDeletionReportForTests() throws IOException {
|
||||||
|
for (BPServiceActor actor : bpServices) {
|
||||||
|
actor.triggerDeletionReportForTests();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Run an immediate heartbeat from all actors. Used by tests.
|
* Run an immediate heartbeat from all actors. Used by tests.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -281,8 +281,18 @@ class BPServiceActor implements Runnable {
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
void triggerBlockReportForTests() throws IOException {
|
void triggerBlockReportForTests() throws IOException {
|
||||||
|
synchronized (receivedAndDeletedBlockList) {
|
||||||
lastBlockReport = 0;
|
lastBlockReport = 0;
|
||||||
blockReport();
|
lastHeartbeat = 0;
|
||||||
|
receivedAndDeletedBlockList.notifyAll();
|
||||||
|
while (lastBlockReport == 0) {
|
||||||
|
try {
|
||||||
|
receivedAndDeletedBlockList.wait(100);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
|
@ -290,6 +300,29 @@ class BPServiceActor implements Runnable {
|
||||||
synchronized (receivedAndDeletedBlockList) {
|
synchronized (receivedAndDeletedBlockList) {
|
||||||
lastHeartbeat = 0;
|
lastHeartbeat = 0;
|
||||||
receivedAndDeletedBlockList.notifyAll();
|
receivedAndDeletedBlockList.notifyAll();
|
||||||
|
while (lastHeartbeat == 0) {
|
||||||
|
try {
|
||||||
|
receivedAndDeletedBlockList.wait(100);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@VisibleForTesting
|
||||||
|
void triggerDeletionReportForTests() throws IOException {
|
||||||
|
synchronized (receivedAndDeletedBlockList) {
|
||||||
|
lastDeletedReport = 0;
|
||||||
|
receivedAndDeletedBlockList.notifyAll();
|
||||||
|
|
||||||
|
while (lastDeletedReport == 0) {
|
||||||
|
try {
|
||||||
|
receivedAndDeletedBlockList.wait(100);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -107,6 +107,14 @@ class FSDatasetAsyncDiskService {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
synchronized long countPendingDeletions() {
|
||||||
|
long count = 0;
|
||||||
|
for (ThreadPoolExecutor exec : executors.values()) {
|
||||||
|
count += exec.getTaskCount() - exec.getCompletedTaskCount();
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Execute the task sometime in the future, using ThreadPools.
|
* Execute the task sometime in the future, using ThreadPools.
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -71,6 +71,7 @@ import java.io.FileNotFoundException;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.PrintWriter;
|
import java.io.PrintWriter;
|
||||||
|
import java.io.StringWriter;
|
||||||
import java.lang.management.ManagementFactory;
|
import java.lang.management.ManagementFactory;
|
||||||
import java.net.InetAddress;
|
import java.net.InetAddress;
|
||||||
import java.net.URI;
|
import java.net.URI;
|
||||||
|
@ -508,6 +509,17 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
"taking over writer role in edits logs.");
|
"taking over writer role in edits logs.");
|
||||||
editLogTailer.catchupDuringFailover();
|
editLogTailer.catchupDuringFailover();
|
||||||
|
|
||||||
|
LOG.info("Reprocessing replication and invalidation queues...");
|
||||||
|
blockManager.getDatanodeManager().markAllDatanodesStale();
|
||||||
|
blockManager.clearQueues();
|
||||||
|
blockManager.processMisReplicatedBlocks();
|
||||||
|
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug("NameNode metadata after re-processing " +
|
||||||
|
"replication and invalidation queues during failover:\n" +
|
||||||
|
metaSaveAsString());
|
||||||
|
}
|
||||||
|
|
||||||
long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
|
long nextTxId = dir.fsImage.getLastAppliedTxId() + 1;
|
||||||
LOG.info("Will take over writing edit logs at txnid " +
|
LOG.info("Will take over writing edit logs at txnid " +
|
||||||
nextTxId);
|
nextTxId);
|
||||||
|
@ -781,14 +793,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
File file = new File(System.getProperty("hadoop.log.dir"), filename);
|
File file = new File(System.getProperty("hadoop.log.dir"), filename);
|
||||||
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file,
|
PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(file,
|
||||||
true)));
|
true)));
|
||||||
|
metaSave(out);
|
||||||
long totalInodes = this.dir.totalInodes();
|
|
||||||
long totalBlocks = this.getBlocksTotal();
|
|
||||||
out.println(totalInodes + " files and directories, " + totalBlocks
|
|
||||||
+ " blocks = " + (totalInodes + totalBlocks) + " total");
|
|
||||||
|
|
||||||
blockManager.metaSave(out);
|
|
||||||
|
|
||||||
out.flush();
|
out.flush();
|
||||||
out.close();
|
out.close();
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -796,6 +801,25 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void metaSave(PrintWriter out) {
|
||||||
|
assert hasWriteLock();
|
||||||
|
long totalInodes = this.dir.totalInodes();
|
||||||
|
long totalBlocks = this.getBlocksTotal();
|
||||||
|
out.println(totalInodes + " files and directories, " + totalBlocks
|
||||||
|
+ " blocks = " + (totalInodes + totalBlocks) + " total");
|
||||||
|
|
||||||
|
blockManager.metaSave(out);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String metaSaveAsString() {
|
||||||
|
StringWriter sw = new StringWriter();
|
||||||
|
PrintWriter pw = new PrintWriter(sw);
|
||||||
|
metaSave(pw);
|
||||||
|
pw.flush();
|
||||||
|
return sw.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
long getDefaultBlockSize() {
|
long getDefaultBlockSize() {
|
||||||
return serverDefaults.getBlockSize();
|
return serverDefaults.getBlockSize();
|
||||||
}
|
}
|
||||||
|
@ -3605,6 +3629,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean isPopulatingReplQueues() {
|
public boolean isPopulatingReplQueues() {
|
||||||
|
if (!haContext.getState().shouldPopulateReplQueues()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
// safeMode is volatile, and may be set to null at any time
|
// safeMode is volatile, and may be set to null at any time
|
||||||
SafeModeInfo safeMode = this.safeMode;
|
SafeModeInfo safeMode = this.safeMode;
|
||||||
if (safeMode == null)
|
if (safeMode == null)
|
||||||
|
@ -3938,6 +3965,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
|
||||||
return blockManager.getExcessBlocksCount();
|
return blockManager.getExcessBlocksCount();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Metric
|
||||||
|
public long getPostponedMisreplicatedBlocks() {
|
||||||
|
return blockManager.getPostponedMisreplicatedBlocksCount();
|
||||||
|
}
|
||||||
|
|
||||||
@Metric
|
@Metric
|
||||||
public int getBlockCapacity() {
|
public int getBlockCapacity() {
|
||||||
return blockManager.getCapacity();
|
return blockManager.getCapacity();
|
||||||
|
|
|
@ -41,6 +41,11 @@ public class ActiveState extends HAState {
|
||||||
return; // Other than journal all operations are allowed in active state
|
return; // Other than journal all operations are allowed in active state
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldPopulateReplQueues() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setState(HAContext context, HAState s) throws ServiceFailedException {
|
public void setState(HAContext context, HAState s) throws ServiceFailedException {
|
||||||
if (s == NameNode.STANDBY_STATE) {
|
if (s == NameNode.STANDBY_STATE) {
|
||||||
|
@ -67,4 +72,5 @@ public class ActiveState extends HAState {
|
||||||
throw new ServiceFailedException("Failed to stop active services", e);
|
throw new ServiceFailedException("Failed to stop active services", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -106,9 +106,12 @@ abstract public class HAState {
|
||||||
public abstract void checkOperation(final HAContext context, final OperationCategory op)
|
public abstract void checkOperation(final HAContext context, final OperationCategory op)
|
||||||
throws StandbyException;
|
throws StandbyException;
|
||||||
|
|
||||||
|
public abstract boolean shouldPopulateReplQueues();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return String representation of the service state.
|
* @return String representation of the service state.
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return state.toString();
|
return state.toString();
|
||||||
}
|
}
|
||||||
|
|
|
@ -80,5 +80,10 @@ public class StandbyState extends HAState {
|
||||||
+ context.getState();
|
+ context.getState();
|
||||||
throw new StandbyException(msg);
|
throw new StandbyException(msg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean shouldPopulateReplQueues() {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -62,6 +62,7 @@ import org.apache.hadoop.hdfs.protocolR23Compatible.ClientNamenodeWireProtocol;
|
||||||
import org.apache.hadoop.hdfs.server.common.Storage;
|
import org.apache.hadoop.hdfs.server.common.Storage;
|
||||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
|
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
|
import org.apache.hadoop.hdfs.server.datanode.DataStorage;
|
||||||
import org.apache.hadoop.hdfs.server.datanode.FSDatasetInterface;
|
import org.apache.hadoop.hdfs.server.datanode.FSDatasetInterface;
|
||||||
|
@ -1575,6 +1576,30 @@ public class MiniDFSCluster {
|
||||||
getHaServiceClient(nnIndex).transitionToStandby();
|
getHaServiceClient(nnIndex).transitionToStandby();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void triggerBlockReports()
|
||||||
|
throws IOException {
|
||||||
|
for (DataNode dn : getDataNodes()) {
|
||||||
|
DataNodeAdapter.triggerBlockReport(dn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void triggerDeletionReports()
|
||||||
|
throws IOException {
|
||||||
|
for (DataNode dn : getDataNodes()) {
|
||||||
|
DataNodeAdapter.triggerDeletionReport(dn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void triggerHeartbeats()
|
||||||
|
throws IOException {
|
||||||
|
for (DataNode dn : getDataNodes()) {
|
||||||
|
DataNodeAdapter.triggerHeartbeat(dn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/** Wait until the given namenode gets registration from all the datanodes */
|
/** Wait until the given namenode gets registration from all the datanodes */
|
||||||
public void waitActive(int nnIndex) throws IOException {
|
public void waitActive(int nnIndex) throws IOException {
|
||||||
if (nameNodes.length == 0 || nameNodes[nnIndex] == null) {
|
if (nameNodes.length == 0 || nameNodes[nnIndex] == null) {
|
||||||
|
|
|
@ -122,4 +122,26 @@ public class BlockManagerTestUtil {
|
||||||
return blockManager.computeDatanodeWork();
|
return blockManager.computeDatanodeWork();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static int computeInvalidationWork(BlockManager bm) {
|
||||||
|
return bm.computeInvalidateWork(Integer.MAX_VALUE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute all the replication and invalidation work for the
|
||||||
|
* given BlockManager.
|
||||||
|
*
|
||||||
|
* This differs from the above functions in that it computes
|
||||||
|
* replication work for all DNs rather than a particular subset,
|
||||||
|
* regardless of invalidation/replication limit configurations.
|
||||||
|
*
|
||||||
|
* NB: you may want to set
|
||||||
|
* {@link DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY} to
|
||||||
|
* a high value to ensure that all work is calculated.
|
||||||
|
*/
|
||||||
|
public static int computeAllPendingWork(BlockManager bm)
|
||||||
|
throws IOException {
|
||||||
|
int work = computeInvalidationWork(bm);
|
||||||
|
work += bm.computeReplicationWork(Integer.MAX_VALUE);
|
||||||
|
return work;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,8 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.hadoop.hdfs.server.datanode;
|
package org.apache.hadoop.hdfs.server.datanode;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* WARNING!! This is TEST ONLY class: it never has to be used
|
* WARNING!! This is TEST ONLY class: it never has to be used
|
||||||
* for ANY development purposes.
|
* for ANY development purposes.
|
||||||
|
@ -42,4 +44,27 @@ public class DataNodeAdapter {
|
||||||
boolean heartbeatsDisabledForTests) {
|
boolean heartbeatsDisabledForTests) {
|
||||||
dn.setHeartbeatsDisabledForTests(heartbeatsDisabledForTests);
|
dn.setHeartbeatsDisabledForTests(heartbeatsDisabledForTests);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void triggerDeletionReport(DataNode dn) throws IOException {
|
||||||
|
for (BPOfferService bpos : dn.getAllBpOs()) {
|
||||||
|
bpos.triggerDeletionReportForTests();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void triggerHeartbeat(DataNode dn) throws IOException {
|
||||||
|
for (BPOfferService bpos : dn.getAllBpOs()) {
|
||||||
|
bpos.triggerHeartbeatForTests();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void triggerBlockReport(DataNode dn) throws IOException {
|
||||||
|
for (BPOfferService bpos : dn.getAllBpOs()) {
|
||||||
|
bpos.triggerBlockReportForTests();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static long getPendingAsyncDeletions(DataNode dn) {
|
||||||
|
FSDataset fsd = (FSDataset)dn.getFSDataset();
|
||||||
|
return fsd.asyncDiskService.countPendingDeletions();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -81,6 +81,11 @@ public class NameNodeAdapter {
|
||||||
namenode.getNamesystem().leaveSafeMode(checkForUpgrades);
|
namenode.getNamesystem().leaveSafeMode(checkForUpgrades);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static void abortEditLogs(NameNode nn) {
|
||||||
|
FSEditLog el = nn.getFSImage().getEditLog();
|
||||||
|
el.abortCurrentLogSegment();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Get the internal RPC server instance.
|
* Get the internal RPC server instance.
|
||||||
* @return rpc server
|
* @return rpc server
|
||||||
|
|
|
@ -0,0 +1,469 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.namenode.ha;
|
||||||
|
|
||||||
|
import static org.junit.Assert.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.PrintWriter;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.TimeoutException;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.commons.logging.impl.Log4JLogger;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.ha.ServiceFailedException;
|
||||||
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
|
import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.DFSUtil;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
|
import org.apache.hadoop.hdfs.TestDFSClientFailover;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.Block;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicy;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockPlacementPolicyDefault;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.FSInodeInfo;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||||
|
import org.apache.hadoop.net.NetworkTopology;
|
||||||
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
|
import org.apache.log4j.Level;
|
||||||
|
import org.junit.After;
|
||||||
|
import org.junit.Before;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import com.google.common.base.Supplier;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
|
||||||
|
public class TestDNFencing {
|
||||||
|
|
||||||
|
protected static final Log LOG = LogFactory.getLog(
|
||||||
|
TestDNFencing.class);
|
||||||
|
private static final String TEST_FILE_DATA = "hello highly available world";
|
||||||
|
private static final String TEST_FILE = "/testStandbyIsHot";
|
||||||
|
private static final Path TEST_FILE_PATH = new Path(TEST_FILE);
|
||||||
|
private static final int SMALL_BLOCK = 1024;
|
||||||
|
|
||||||
|
private Configuration conf;
|
||||||
|
private MiniDFSCluster cluster;
|
||||||
|
private NameNode nn1, nn2;
|
||||||
|
private FileSystem fs;
|
||||||
|
|
||||||
|
static {
|
||||||
|
((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
|
||||||
|
((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL);
|
||||||
|
((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Before
|
||||||
|
public void setupCluster() throws Exception {
|
||||||
|
conf = new Configuration();
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, SMALL_BLOCK);
|
||||||
|
// Bump up replication interval so that we only run replication
|
||||||
|
// checks explicitly.
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_INTERVAL_KEY, 600);
|
||||||
|
// Increase max streams so that we re-replicate quickly.
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000);
|
||||||
|
// See RandomDeleterPolicy javadoc.
|
||||||
|
conf.setClass("dfs.block.replicator.classname", RandomDeleterPolicy.class,
|
||||||
|
BlockPlacementPolicy.class);
|
||||||
|
cluster = new MiniDFSCluster.Builder(conf)
|
||||||
|
.nnTopology(MiniDFSNNTopology.simpleHATopology())
|
||||||
|
.numDataNodes(3)
|
||||||
|
.build();
|
||||||
|
nn1 = cluster.getNameNode(0);
|
||||||
|
nn2 = cluster.getNameNode(1);
|
||||||
|
|
||||||
|
cluster.waitActive();
|
||||||
|
cluster.transitionToActive(0);
|
||||||
|
// Trigger block reports so that the first NN trusts all
|
||||||
|
// of the DNs, and will issue deletions
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
nn2.getNamesystem().getEditLogTailer().setSleepTime(250);
|
||||||
|
nn2.getNamesystem().getEditLogTailer().interrupt();
|
||||||
|
fs = TestDFSClientFailover.configureFailoverFs(cluster, conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
@After
|
||||||
|
public void shutdownCluster() throws Exception {
|
||||||
|
if (cluster != null) {
|
||||||
|
banner("Shutting down cluster. NN1 metadata:");
|
||||||
|
doMetasave(nn1);
|
||||||
|
banner("Shutting down cluster. NN2 metadata:");
|
||||||
|
doMetasave(nn2);
|
||||||
|
cluster.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDnFencing() throws Exception {
|
||||||
|
// Create a file with replication level 3.
|
||||||
|
DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L);
|
||||||
|
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_FILE_PATH);
|
||||||
|
|
||||||
|
// Drop its replication count to 1, so it becomes over-replicated.
|
||||||
|
// Then compute the invalidation of the extra blocks and trigger
|
||||||
|
// heartbeats so the invalidations are flushed to the DNs.
|
||||||
|
nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn1.getNamesystem().getBlockManager());
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
|
||||||
|
// Transition nn2 to active even though nn1 still thinks it's active.
|
||||||
|
banner("Failing to NN2 but let NN1 continue to think it's active");
|
||||||
|
NameNodeAdapter.abortEditLogs(nn1);
|
||||||
|
NameNodeAdapter.enterSafeMode(nn1, false);
|
||||||
|
cluster.transitionToActive(1);
|
||||||
|
|
||||||
|
// Check that the standby picked up the replication change.
|
||||||
|
assertEquals(1,
|
||||||
|
nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());
|
||||||
|
|
||||||
|
// Dump some info for debugging purposes.
|
||||||
|
banner("NN2 Metadata immediately after failover");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// Even though NN2 considers the blocks over-replicated, it should
|
||||||
|
// post-pone the block invalidation because the DNs are still "stale".
|
||||||
|
assertEquals(30, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
banner("Triggering heartbeats and block reports so that fencing is completed");
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
|
||||||
|
banner("Metadata after nodes have all block-reported");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// The blocks should no longer be postponed.
|
||||||
|
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
// Wait for NN2 to enact its deletions (replication monitor has to run, etc)
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn2.getNamesystem().getBlockManager());
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
waitForDNDeletions(cluster);
|
||||||
|
cluster.triggerDeletionReports();
|
||||||
|
assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
|
||||||
|
assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());
|
||||||
|
|
||||||
|
banner("Making sure the file is still readable");
|
||||||
|
FileSystem fs2 = cluster.getFileSystem(1);
|
||||||
|
DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
|
||||||
|
|
||||||
|
banner("Waiting for the actual block files to get deleted from DNs.");
|
||||||
|
waitForTrueReplication(cluster, block, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test case which restarts the standby node in such a way that,
|
||||||
|
* when it exits safemode, it will want to invalidate a bunch
|
||||||
|
* of over-replicated block replicas. Ensures that if we failover
|
||||||
|
* at this point it won't lose data.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testNNClearsCommandsOnFailoverAfterStartup()
|
||||||
|
throws Exception {
|
||||||
|
// Make lots of blocks to increase chances of triggering a bug.
|
||||||
|
DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)3, 1L);
|
||||||
|
|
||||||
|
banner("Shutting down NN2");
|
||||||
|
cluster.shutdownNameNode(1);
|
||||||
|
|
||||||
|
banner("Setting replication to 1, rolling edit log.");
|
||||||
|
nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
|
||||||
|
nn1.getRpcServer().rollEditLog();
|
||||||
|
|
||||||
|
// Start NN2 again. When it starts up, it will see all of the
|
||||||
|
// blocks as over-replicated, since it has the metadata for
|
||||||
|
// replication=1, but the DNs haven't yet processed the deletions.
|
||||||
|
banner("Starting NN2 again.");
|
||||||
|
cluster.restartNameNode(1);
|
||||||
|
nn2 = cluster.getNameNode(1);
|
||||||
|
|
||||||
|
banner("triggering BRs");
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
|
||||||
|
// We expect that both NN1 and NN2 will have some number of
|
||||||
|
// deletions queued up for the DNs.
|
||||||
|
banner("computing invalidation on nn1");
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn1.getNamesystem().getBlockManager());
|
||||||
|
|
||||||
|
banner("computing invalidation on nn2");
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn2.getNamesystem().getBlockManager());
|
||||||
|
|
||||||
|
// Dump some info for debugging purposes.
|
||||||
|
banner("Metadata immediately before failover");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
|
||||||
|
// Transition nn2 to active even though nn1 still thinks it's active
|
||||||
|
banner("Failing to NN2 but let NN1 continue to think it's active");
|
||||||
|
NameNodeAdapter.abortEditLogs(nn1);
|
||||||
|
NameNodeAdapter.enterSafeMode(nn1, false);
|
||||||
|
|
||||||
|
cluster.transitionToActive(1);
|
||||||
|
|
||||||
|
// Check that the standby picked up the replication change.
|
||||||
|
assertEquals(1,
|
||||||
|
nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());
|
||||||
|
|
||||||
|
// Dump some info for debugging purposes.
|
||||||
|
banner("Metadata immediately after failover");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
banner("Triggering heartbeats and block reports so that fencing is completed");
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
|
||||||
|
banner("Metadata after nodes have all block-reported");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// The block should no longer be postponed.
|
||||||
|
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
// Wait for NN2 to enact its deletions (replication monitor has to run, etc)
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn2.getNamesystem().getBlockManager());
|
||||||
|
|
||||||
|
waitForNNToIssueDeletions(nn2);
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
waitForDNDeletions(cluster);
|
||||||
|
cluster.triggerDeletionReports();
|
||||||
|
assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
|
||||||
|
assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());
|
||||||
|
|
||||||
|
banner("Making sure the file is still readable");
|
||||||
|
FileSystem fs2 = cluster.getFileSystem(1);
|
||||||
|
DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test case that reduces replication of a file with a lot of blocks
|
||||||
|
* and then fails over right after those blocks enter the DN invalidation
|
||||||
|
* queues on the active. Ensures that fencing is correct and no replicas
|
||||||
|
* are lost.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testNNClearsCommandsOnFailoverWithReplChanges()
|
||||||
|
throws Exception {
|
||||||
|
// Make lots of blocks to increase chances of triggering a bug.
|
||||||
|
DFSTestUtil.createFile(fs, TEST_FILE_PATH, 30*SMALL_BLOCK, (short)1, 1L);
|
||||||
|
|
||||||
|
banner("rolling NN1's edit log, forcing catch-up");
|
||||||
|
TestEditLogTailer.waitForStandbyToCatchUp(nn1, nn2);
|
||||||
|
|
||||||
|
// Get some new replicas reported so that NN2 now considers
|
||||||
|
// them over-replicated and schedules some more deletions
|
||||||
|
nn1.getRpcServer().setReplication(TEST_FILE, (short) 2);
|
||||||
|
while (BlockManagerTestUtil.getComputedDatanodeWork(
|
||||||
|
nn1.getNamesystem().getBlockManager()) > 0) {
|
||||||
|
LOG.info("Getting more replication work computed");
|
||||||
|
}
|
||||||
|
BlockManager bm1 = nn1.getNamesystem().getBlockManager();
|
||||||
|
while (bm1.getPendingReplicationBlocksCount() > 0) {
|
||||||
|
BlockManagerTestUtil.updateState(bm1);
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
Thread.sleep(1000);
|
||||||
|
}
|
||||||
|
|
||||||
|
banner("triggering BRs");
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
|
||||||
|
nn1.getRpcServer().setReplication(TEST_FILE, (short) 1);
|
||||||
|
|
||||||
|
|
||||||
|
banner("computing invalidation on nn1");
|
||||||
|
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn1.getNamesystem().getBlockManager());
|
||||||
|
doMetasave(nn1);
|
||||||
|
|
||||||
|
banner("computing invalidation on nn2");
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn2.getNamesystem().getBlockManager());
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// Dump some info for debugging purposes.
|
||||||
|
banner("Metadata immediately before failover");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
|
||||||
|
// Transition nn2 to active even though nn1 still thinks it's active
|
||||||
|
banner("Failing to NN2 but let NN1 continue to think it's active");
|
||||||
|
NameNodeAdapter.abortEditLogs(nn1);
|
||||||
|
NameNodeAdapter.enterSafeMode(nn1, false);
|
||||||
|
|
||||||
|
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn2.getNamesystem().getBlockManager());
|
||||||
|
cluster.transitionToActive(1);
|
||||||
|
|
||||||
|
// Check that the standby picked up the replication change.
|
||||||
|
assertEquals(1,
|
||||||
|
nn2.getRpcServer().getFileInfo(TEST_FILE).getReplication());
|
||||||
|
|
||||||
|
// Dump some info for debugging purposes.
|
||||||
|
banner("Metadata immediately after failover");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
banner("Triggering heartbeats and block reports so that fencing is completed");
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
|
||||||
|
banner("Metadata after nodes have all block-reported");
|
||||||
|
doMetasave(nn2);
|
||||||
|
|
||||||
|
// The block should no longer be postponed.
|
||||||
|
assertEquals(0, nn2.getNamesystem().getPostponedMisreplicatedBlocks());
|
||||||
|
|
||||||
|
// Wait for NN2 to enact its deletions (replication monitor has to run, etc)
|
||||||
|
BlockManagerTestUtil.computeInvalidationWork(
|
||||||
|
nn2.getNamesystem().getBlockManager());
|
||||||
|
|
||||||
|
waitForNNToIssueDeletions(nn2);
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
waitForDNDeletions(cluster);
|
||||||
|
cluster.triggerDeletionReports();
|
||||||
|
assertEquals(0, nn2.getNamesystem().getUnderReplicatedBlocks());
|
||||||
|
assertEquals(0, nn2.getNamesystem().getPendingReplicationBlocks());
|
||||||
|
|
||||||
|
banner("Making sure the file is still readable");
|
||||||
|
FileSystem fs2 = cluster.getFileSystem(1);
|
||||||
|
DFSTestUtil.readFile(fs2, TEST_FILE_PATH);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Print a big banner in the test log to make debug easier.
|
||||||
|
*/
|
||||||
|
private void banner(String string) {
|
||||||
|
LOG.info("\n\n\n\n================================================\n" +
|
||||||
|
string + "\n" +
|
||||||
|
"==================================================\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doMetasave(NameNode nn2) {
|
||||||
|
nn2.getNamesystem().writeLock();
|
||||||
|
try {
|
||||||
|
PrintWriter pw = new PrintWriter(System.err);
|
||||||
|
nn2.getNamesystem().getBlockManager().metaSave(pw);
|
||||||
|
pw.flush();
|
||||||
|
} finally {
|
||||||
|
nn2.getNamesystem().writeUnlock();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForTrueReplication(final MiniDFSCluster cluster,
|
||||||
|
final ExtendedBlock block, final int waitFor) throws Exception {
|
||||||
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean get() {
|
||||||
|
try {
|
||||||
|
return getTrueReplication(cluster, block) == waitFor;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 500, 10000);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int getTrueReplication(MiniDFSCluster cluster, ExtendedBlock block)
|
||||||
|
throws IOException {
|
||||||
|
int count = 0;
|
||||||
|
for (DataNode dn : cluster.getDataNodes()) {
|
||||||
|
if (dn.getFSDataset().getStoredBlock(block.getBlockPoolId(), block.getBlockId()) != null) {
|
||||||
|
count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return count;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForDNDeletions(final MiniDFSCluster cluster)
|
||||||
|
throws TimeoutException, InterruptedException {
|
||||||
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean get() {
|
||||||
|
for (DataNode dn : cluster.getDataNodes()) {
|
||||||
|
if (DataNodeAdapter.getPendingAsyncDeletions(dn) > 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}, 1000, 10000);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForNNToIssueDeletions(final NameNode nn)
|
||||||
|
throws Exception {
|
||||||
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean get() {
|
||||||
|
LOG.info("Waiting for NN to issue block deletions to DNs");
|
||||||
|
return nn.getNamesystem().getBlockManager().getPendingDeletionBlocksCount() == 0;
|
||||||
|
}
|
||||||
|
}, 250, 10000);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A BlockPlacementPolicy which, rather than using space available, makes
|
||||||
|
* random decisions about which excess replica to delete. This is because,
|
||||||
|
* in the test cases, the two NNs will usually (but not quite always)
|
||||||
|
* make the same decision of which replica to delete. The fencing issues
|
||||||
|
* are exacerbated when the two NNs make different decisions, which can
|
||||||
|
* happen in "real life" when they have slightly out-of-sync heartbeat
|
||||||
|
* information regarding disk usage.
|
||||||
|
*/
|
||||||
|
public static class RandomDeleterPolicy extends BlockPlacementPolicyDefault {
|
||||||
|
|
||||||
|
public RandomDeleterPolicy() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DatanodeDescriptor chooseReplicaToDelete(FSInodeInfo inode,
|
||||||
|
Block block, short replicationFactor,
|
||||||
|
Collection<DatanodeDescriptor> first,
|
||||||
|
Collection<DatanodeDescriptor> second) {
|
||||||
|
|
||||||
|
Collection<DatanodeDescriptor> chooseFrom =
|
||||||
|
!first.isEmpty() ? first : second;
|
||||||
|
|
||||||
|
List<DatanodeDescriptor> l = Lists.newArrayList(chooseFrom);
|
||||||
|
return l.get(DFSUtil.getRandom().nextInt(l.size()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,204 @@
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.hdfs.server.namenode.ha;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.concurrent.TimeoutException;
|
||||||
|
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.commons.logging.impl.Log4JLogger;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.BlockLocation;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
|
import org.apache.hadoop.hdfs.DFSTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
|
import org.apache.hadoop.hdfs.TestDFSClientFailover;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
|
import org.apache.hadoop.ipc.Server;
|
||||||
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
|
import org.apache.hadoop.test.MultithreadedTestUtil.RepeatingTestThread;
|
||||||
|
import org.apache.hadoop.test.MultithreadedTestUtil.TestContext;
|
||||||
|
import org.apache.log4j.Level;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import com.google.common.base.Supplier;
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Stress-test for potential bugs when replication is changing
|
||||||
|
* on blocks during a failover.
|
||||||
|
*/
|
||||||
|
public class TestDNFencingWithReplication {
|
||||||
|
static {
|
||||||
|
((Log4JLogger)FSNamesystem.auditLog).getLogger().setLevel(Level.WARN);
|
||||||
|
((Log4JLogger)Server.LOG).getLogger().setLevel(Level.FATAL);
|
||||||
|
((Log4JLogger)LogFactory.getLog(
|
||||||
|
"org.apache.hadoop.io.retry.RetryInvocationHandler"))
|
||||||
|
.getLogger().setLevel(Level.FATAL);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static final int NUM_THREADS = 20;
|
||||||
|
// How long should the test try to run for. In practice
|
||||||
|
// it runs for ~20-30s longer than this constant due to startup/
|
||||||
|
// shutdown time.
|
||||||
|
private static final long RUNTIME = 35000;
|
||||||
|
private static final int BLOCK_SIZE = 1024;
|
||||||
|
|
||||||
|
private static class ReplicationToggler extends RepeatingTestThread {
|
||||||
|
private final FileSystem fs;
|
||||||
|
private final Path path;
|
||||||
|
|
||||||
|
public ReplicationToggler(TestContext ctx, FileSystem fs, Path p) {
|
||||||
|
super(ctx);
|
||||||
|
this.fs = fs;
|
||||||
|
this.path = p;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void doAnAction() throws Exception {
|
||||||
|
fs.setReplication(path, (short)1);
|
||||||
|
waitForReplicas(1);
|
||||||
|
fs.setReplication(path, (short)2);
|
||||||
|
waitForReplicas(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void waitForReplicas(final int replicas) throws Exception {
|
||||||
|
try {
|
||||||
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
@Override
|
||||||
|
public Boolean get() {
|
||||||
|
try {
|
||||||
|
BlockLocation[] blocks = fs.getFileBlockLocations(path, 0, 10);
|
||||||
|
Assert.assertEquals(1, blocks.length);
|
||||||
|
return blocks[0].getHosts().length == replicas;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}, 100, 60000);
|
||||||
|
} catch (TimeoutException te) {
|
||||||
|
throw new IOException("Timed out waiting for " + replicas + " replicas " +
|
||||||
|
"on path " + path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "Toggler for " + path;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFencingStress() throws Exception {
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000);
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY, 1);
|
||||||
|
// Increase max streams so that we re-replicate quickly.
|
||||||
|
conf.setInt(DFSConfigKeys.DFS_NAMENODE_REPLICATION_MAX_STREAMS_KEY, 1000);
|
||||||
|
|
||||||
|
final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
|
||||||
|
.nnTopology(MiniDFSNNTopology.simpleHATopology())
|
||||||
|
.numDataNodes(3)
|
||||||
|
.build();
|
||||||
|
try {
|
||||||
|
cluster.waitActive();
|
||||||
|
cluster.transitionToActive(0);
|
||||||
|
|
||||||
|
final NameNode nn1 = cluster.getNameNode(0);
|
||||||
|
final NameNode nn2 = cluster.getNameNode(1);
|
||||||
|
nn2.getNamesystem().getEditLogTailer().setSleepTime(250);
|
||||||
|
nn2.getNamesystem().getEditLogTailer().interrupt();
|
||||||
|
|
||||||
|
FileSystem fs = TestDFSClientFailover.configureFailoverFs(
|
||||||
|
cluster, conf);
|
||||||
|
TestContext togglers = new TestContext();
|
||||||
|
for (int i = 0; i < NUM_THREADS; i++) {
|
||||||
|
Path p = new Path("/test-" + i);
|
||||||
|
DFSTestUtil.createFile(fs, p, BLOCK_SIZE*10, (short)3, (long)i);
|
||||||
|
togglers.addThread(new ReplicationToggler(togglers, fs, p));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start a separate thread which will make sure that replication
|
||||||
|
// happens quickly by triggering deletion reports and replication
|
||||||
|
// work calculation frequently.
|
||||||
|
TestContext triggerCtx = new TestContext();
|
||||||
|
triggerCtx.addThread(new RepeatingTestThread(triggerCtx) {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void doAnAction() throws Exception {
|
||||||
|
for (DataNode dn : cluster.getDataNodes()) {
|
||||||
|
DataNodeAdapter.triggerDeletionReport(dn);
|
||||||
|
DataNodeAdapter.triggerHeartbeat(dn);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
NameNode nn = cluster.getNameNode(i);
|
||||||
|
BlockManagerTestUtil.computeAllPendingWork(
|
||||||
|
nn.getNamesystem().getBlockManager());
|
||||||
|
}
|
||||||
|
Thread.sleep(500);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
triggerCtx.addThread(new RepeatingTestThread(triggerCtx) {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void doAnAction() throws Exception {
|
||||||
|
System.err.println("==============================\n" +
|
||||||
|
"Failing over from 0->1\n" +
|
||||||
|
"==================================");
|
||||||
|
cluster.transitionToStandby(0);
|
||||||
|
cluster.transitionToActive(1);
|
||||||
|
|
||||||
|
Thread.sleep(5000);
|
||||||
|
System.err.println("==============================\n" +
|
||||||
|
"Failing over from 1->0\n" +
|
||||||
|
"==================================");
|
||||||
|
|
||||||
|
cluster.transitionToStandby(1);
|
||||||
|
cluster.transitionToActive(0);
|
||||||
|
Thread.sleep(5000);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
triggerCtx.startThreads();
|
||||||
|
togglers.startThreads();
|
||||||
|
|
||||||
|
togglers.waitFor(RUNTIME);
|
||||||
|
togglers.stop();
|
||||||
|
triggerCtx.stop();
|
||||||
|
|
||||||
|
// CHeck that the files can be read without throwing
|
||||||
|
for (int i = 0; i < NUM_THREADS; i++) {
|
||||||
|
Path p = new Path("/test-" + i);
|
||||||
|
DFSTestUtil.readFile(fs, p);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
System.err.println("===========================\n\n\n\n");
|
||||||
|
cluster.shutdown();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -108,8 +108,7 @@ public class TestEditLogTailer {
|
||||||
long activeTxId = active.getNamesystem().getFSImage().getEditLog()
|
long activeTxId = active.getNamesystem().getFSImage().getEditLog()
|
||||||
.getLastWrittenTxId();
|
.getLastWrittenTxId();
|
||||||
|
|
||||||
// TODO: we should really just ask for a log roll here
|
active.getRpcServer().rollEditLog();
|
||||||
doSaveNamespace(active);
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
long start = System.currentTimeMillis();
|
||||||
while (System.currentTimeMillis() - start < NN_LAG_TIMEOUT) {
|
while (System.currentTimeMillis() - start < NN_LAG_TIMEOUT) {
|
||||||
|
@ -124,12 +123,4 @@ public class TestEditLogTailer {
|
||||||
" (currently at " +
|
" (currently at " +
|
||||||
standby.getNamesystem().getFSImage().getLastAppliedTxId() + ")");
|
standby.getNamesystem().getFSImage().getLastAppliedTxId() + ")");
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void doSaveNamespace(NameNode nn)
|
|
||||||
throws IOException {
|
|
||||||
NameNodeAdapter.enterSafeMode(nn, false);
|
|
||||||
NameNodeAdapter.saveNamespace(nn);
|
|
||||||
NameNodeAdapter.leaveSafeMode(nn, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,6 +24,7 @@ import java.util.concurrent.TimeoutException;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.commons.logging.impl.Log4JLogger;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
@ -33,10 +34,17 @@ import org.apache.hadoop.hdfs.HAUtil;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||||
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
import org.apache.hadoop.hdfs.MiniDFSNNTopology;
|
||||||
import org.apache.hadoop.hdfs.TestDFSClientFailover;
|
import org.apache.hadoop.hdfs.TestDFSClientFailover;
|
||||||
|
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
|
||||||
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
|
||||||
|
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNode;
|
||||||
|
import org.apache.hadoop.hdfs.server.datanode.DataNodeAdapter;
|
||||||
|
import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||||
import org.apache.hadoop.test.GenericTestUtils;
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
|
import org.apache.log4j.Level;
|
||||||
|
import org.junit.Assert;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
import com.google.common.base.Supplier;
|
import com.google.common.base.Supplier;
|
||||||
|
@ -52,6 +60,12 @@ public class TestStandbyIsHot {
|
||||||
private static final String TEST_FILE = "/testStandbyIsHot";
|
private static final String TEST_FILE = "/testStandbyIsHot";
|
||||||
private static final Path TEST_FILE_PATH = new Path(TEST_FILE);
|
private static final Path TEST_FILE_PATH = new Path(TEST_FILE);
|
||||||
|
|
||||||
|
static {
|
||||||
|
((Log4JLogger)LogFactory.getLog(FSNamesystem.class)).getLogger().setLevel(Level.ALL);
|
||||||
|
((Log4JLogger)LogFactory.getLog(BlockManager.class)).getLogger().setLevel(Level.ALL);
|
||||||
|
((Log4JLogger)NameNode.stateChangeLog).getLogger().setLevel(Level.ALL);
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testStandbyIsHot() throws Exception {
|
public void testStandbyIsHot() throws Exception {
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
|
@ -79,19 +93,40 @@ public class TestStandbyIsHot {
|
||||||
nn1.getRpcServer().rollEditLog();
|
nn1.getRpcServer().rollEditLog();
|
||||||
System.err.println("==================================");
|
System.err.println("==================================");
|
||||||
|
|
||||||
waitForBlockLocations(nn2, TEST_FILE, 3);
|
// Block locations should show up on standby.
|
||||||
|
LOG.info("Waiting for block locations to appear on standby node");
|
||||||
|
waitForBlockLocations(cluster, nn2, TEST_FILE, 3);
|
||||||
|
|
||||||
nn1.stop();
|
// Trigger immediate heartbeats and block reports so
|
||||||
cluster.transitionToActive(1);
|
// that the active "trusts" all of the DNs
|
||||||
|
cluster.triggerHeartbeats();
|
||||||
|
cluster.triggerBlockReports();
|
||||||
|
|
||||||
assertEquals(TEST_FILE_DATA, DFSTestUtil.readFile(fs, TEST_FILE_PATH));
|
// Change replication
|
||||||
|
LOG.info("Changing replication to 1");
|
||||||
|
fs.setReplication(TEST_FILE_PATH, (short)1);
|
||||||
|
waitForBlockLocations(cluster, nn1, TEST_FILE, 1);
|
||||||
|
|
||||||
|
nn1.getRpcServer().rollEditLog();
|
||||||
|
|
||||||
|
LOG.info("Waiting for lowered replication to show up on standby");
|
||||||
|
waitForBlockLocations(cluster, nn2, TEST_FILE, 1);
|
||||||
|
|
||||||
|
// Change back to 3
|
||||||
|
LOG.info("Changing replication to 3");
|
||||||
|
fs.setReplication(TEST_FILE_PATH, (short)3);
|
||||||
|
nn1.getRpcServer().rollEditLog();
|
||||||
|
|
||||||
|
LOG.info("Waiting for higher replication to show up on standby");
|
||||||
|
waitForBlockLocations(cluster, nn2, TEST_FILE, 3);
|
||||||
|
|
||||||
} finally {
|
} finally {
|
||||||
cluster.shutdown();
|
cluster.shutdown();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void waitForBlockLocations(final NameNode nn,
|
static void waitForBlockLocations(final MiniDFSCluster cluster,
|
||||||
|
final NameNode nn,
|
||||||
final String path, final int expectedReplicas)
|
final String path, final int expectedReplicas)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||||
|
@ -100,8 +135,19 @@ public class TestStandbyIsHot {
|
||||||
public Boolean get() {
|
public Boolean get() {
|
||||||
try {
|
try {
|
||||||
LocatedBlocks locs = NameNodeAdapter.getBlockLocations(nn, path, 0, 1000);
|
LocatedBlocks locs = NameNodeAdapter.getBlockLocations(nn, path, 0, 1000);
|
||||||
LOG.info("Got locs: " + locs);
|
DatanodeInfo[] dnis = locs.getLastLocatedBlock().getLocations();
|
||||||
return locs.getLastLocatedBlock().getLocations().length == expectedReplicas;
|
for (DatanodeInfo dni : dnis) {
|
||||||
|
Assert.assertNotNull(dni);
|
||||||
|
}
|
||||||
|
int numReplicas = dnis.length;
|
||||||
|
|
||||||
|
LOG.info("Got " + numReplicas + " locs: " + locs);
|
||||||
|
if (numReplicas > expectedReplicas) {
|
||||||
|
for (DataNode dn : cluster.getDataNodes()) {
|
||||||
|
DataNodeAdapter.triggerDeletionReport(dn);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return numReplicas == expectedReplicas;
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.warn("No block locations yet: " + e.getMessage());
|
LOG.warn("No block locations yet: " + e.getMessage());
|
||||||
return false;
|
return false;
|
||||||
|
|
Loading…
Reference in New Issue