HDFS-7725. Incorrect 'nodes in service' metrics caused all writes to fail. Contributed by Ming Ma.

(cherry picked from commit 8104d52269)
Committed by Kihwal Lee on 2015-10-27 11:09:05 -05:00
parent 653ef52ef2
commit 1d23e1ec07
4 changed files with 41 additions and 24 deletions
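
Why a metrics bug could stop all writes: HeartbeatManager owns the aggregate datanode statistics, including nodesInService, and the default block placement policy uses that count as the denominator of the cluster-average transceiver load, rejecting any datanode busier than twice the average. Before this fix, recommissioning a node that had died while decommissioned still ran the stats.subtract()/stats.add() bracketing, and the add() counted the dead node as back in service; enough of these and the average collapses, every live node looks overloaded, and no write can find a target. The following is a hypothetical, simplified sketch of that arithmetic; the class and the numbers are invented for illustration, and the real check lives in BlockPlacementPolicyDefault, not in this form.

// LoadCheckSketch.java -- illustrative only, not Hadoop source.
public final class LoadCheckSketch {
  // Accept a write target only if it is no busier than 2x the average
  // load of the nodes counted as "in service".
  static boolean isGoodTarget(int nodeXceivers, int totalInServiceXceivers,
      int nodesInService) {
    double avgLoad = (double) totalInServiceXceivers / nodesInService;
    return nodeXceivers <= 2.0 * avgLoad;
  }

  public static void main(String[] args) {
    // 10 live nodes, each running 100 transceivers.
    System.out.println(isGoodTarget(100, 1000, 10));   // true:  100 <= 200
    // Recommissioned dead nodes inflate the denominator to 100; the average
    // collapses to 10 and every real node now exceeds the 2x cap.
    System.out.println(isGoodTarget(100, 1000, 100));  // false: 100 > 20
  }
}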

hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -28,6 +28,9 @@ Release 2.7.2 - UNRELEASED
     HDFS-6945. BlockManager should remove a block from excessReplicateMap and
     decrement ExcessBlocks metric when the block is removed. (aajisaka)
 
+    HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
+    (Ming Ma via wang)
+
     HDFS-8806. Inconsistent metrics: number of missing blocks with replication
     factor 1 not properly cleared. (Zhe Zhang via aajisaka)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DecommissionManager.java

@@ -197,23 +197,21 @@ public class DecommissionManager {
    */
   @VisibleForTesting
   public void startDecommission(DatanodeDescriptor node) {
-    if (!node.isDecommissionInProgress()) {
-      if (!node.isAlive) {
-        LOG.info("Dead node {} is decommissioned immediately.", node);
-        node.setDecommissioned();
-      } else if (!node.isDecommissioned()) {
+    if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
+      // Update DN stats maintained by HeartbeatManager
+      hbManager.startDecommission(node);
+      // hbManager.startDecommission will set a dead node to decommissioned.
+      if (node.isDecommissionInProgress()) {
         for (DatanodeStorageInfo storage : node.getStorageInfos()) {
           LOG.info("Starting decommission of {} {} with {} blocks",
               node, storage, storage.numBlocks());
         }
-        // Update DN stats maintained by HeartbeatManager
-        hbManager.startDecommission(node);
         node.decommissioningStatus.setStartTime(monotonicNow());
         pendingNodes.add(node);
       }
     } else {
-      LOG.trace("startDecommission: Node {} is already decommission in "
-          + "progress, nothing to do.", node);
+      LOG.trace("startDecommission: Node {} in {}, nothing to do.",
+          node, node.getAdminState());
     }
   }
@@ -221,9 +219,9 @@ public class DecommissionManager {
    * Stop decommissioning the specified datanode.
    * @param node
    */
-  void stopDecommission(DatanodeDescriptor node) {
+  @VisibleForTesting
+  public void stopDecommission(DatanodeDescriptor node) {
     if (node.isDecommissionInProgress() || node.isDecommissioned()) {
-      LOG.info("Stopping decommissioning of node {}", node);
       // Update DN stats maintained by HeartbeatManager
       hbManager.stopDecommission(node);
       // Over-replicated blocks will be detected and processed when
@@ -235,8 +233,8 @@ public class DecommissionManager {
       pendingNodes.remove(node);
       decomNodeBlocks.remove(node);
     } else {
-      LOG.trace("stopDecommission: Node {} is not decommission in progress " +
-          "or decommissioned, nothing to do.", node);
+      LOG.trace("stopDecommission: Node {} in {}, nothing to do.",
+          node, node.getAdminState());
     }
   }
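
Net effect of the DecommissionManager changes above: the manager no longer flips a dead node's admin state itself. Every transition, live or dead, now funnels through hbManager.startDecommission()/stopDecommission(), so the class that owns the aggregate stats also owns the dead-node special case, and the block-scanning setup only runs when the HeartbeatManager actually moved the node into decommission-in-progress. Widening stopDecommission() to public @VisibleForTesting is what lets the new test below drive a recommission directly.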

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/HeartbeatManager.java

@@ -20,8 +20,6 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
 import org.apache.hadoop.hdfs.DFSUtil;
@@ -31,6 +29,8 @@ import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
 import org.apache.hadoop.util.Daemon;
 import org.apache.hadoop.util.Time;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * Manage the heartbeats received from datanodes.
@@ -38,7 +38,7 @@ import org.apache.hadoop.util.Time;
  * by the heartbeat manager lock.
  */
 class HeartbeatManager implements DatanodeStatistics {
-  static final Log LOG = LogFactory.getLog(HeartbeatManager.class);
+  static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
 
 /**
  * Stores a subset of the datanodeMap in DatanodeManager,
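
A note on the import churn above: the new decommission logging in this class uses {} placeholders, which the commons-logging Log interface cannot format, so the class migrates to SLF4J alongside the functional change.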
@@ -227,16 +227,27 @@ class HeartbeatManager implements DatanodeStatistics {
   }
 
   synchronized void startDecommission(final DatanodeDescriptor node) {
-    stats.subtract(node);
-    node.startDecommission();
-    stats.add(node);
+    if (!node.isAlive) {
+      LOG.info("Dead node {} is decommissioned immediately.", node);
+      node.setDecommissioned();
+    } else {
+      stats.subtract(node);
+      node.startDecommission();
+      stats.add(node);
+    }
   }
 
   synchronized void stopDecommission(final DatanodeDescriptor node) {
-    stats.subtract(node);
-    node.stopDecommission();
-    stats.add(node);
+    LOG.info("Stopping decommissioning of {} node {}",
+        node.isAlive ? "live" : "dead", node);
+    if (!node.isAlive) {
+      node.stopDecommission();
+    } else {
+      stats.subtract(node);
+      node.stopDecommission();
+      stats.add(node);
+    }
   }
 
   /**
    * Check if there are any expired heartbeats, and if so,
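
The invariant this hunk establishes: every admin-state change on a live node is wrapped in stats.subtract()/stats.add(), while dead nodes, which were never counted, bypass the stats entirely. Below is a minimal, self-contained sketch of that bracketing; Stats and Node are stand-ins invented for this example (the real Stats tracks many more counters than nodesInService).

public final class BracketingSketch {
  static final class Node {
    boolean alive = true;
    boolean inService = true;  // stand-in for "not decommissioned"
  }

  static final class Stats {
    int nodesInService;
    // Like the real Stats, the counter keys off admin state, not liveness --
    // which is exactly why unconditional bracketing corrupted it.
    void add(Node n)      { if (n.inService) { nodesInService++; } }
    void subtract(Node n) { if (n.inService) { nodesInService--; } }
  }

  // Post-fix pattern: bracket live nodes, skip the stats for dead ones.
  static void changeAdminState(Stats stats, Node node, Runnable transition) {
    if (!node.alive) {
      transition.run();
      return;
    }
    stats.subtract(node);
    transition.run();
    stats.add(node);
  }

  public static void main(String[] args) {
    Stats stats = new Stats();
    Node dead = new Node();
    dead.alive = false;
    dead.inService = false;  // died while decommissioned; never counted

    // Pre-fix behavior: unconditional bracketing on recommission.
    stats.subtract(dead);                       // no-op, not in service
    dead.inService = true;                      // admin state flips
    stats.add(dead);                            // counts a dead node!
    System.out.println(stats.nodesInService);   // 1 -- inflated

    // Post-fix behavior: dead nodes never touch the stats.
    stats.nodesInService = 0;
    dead.inService = false;
    changeAdminState(stats, dead, () -> dead.inService = true);
    System.out.println(stats.nodesInService);   // 0 -- correct
  }
}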

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNamenodeCapacityReport.java

@@ -202,9 +202,14 @@ public class TestNamenodeCapacityReport {
         dn.shutdown();
         DFSTestUtil.setDatanodeDead(dnd);
         BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
+        // Verify decommission of dead node won't impact nodesInService metrics.
+        dnm.getDecomManager().startDecommission(dnd);
         expectedInServiceNodes--;
         assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
         assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
+        // Verify recommission of dead node won't impact nodesInService metrics.
+        dnm.getDecomManager().stopDecommission(dnd);
+        assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
       }
 
       // restart the nodes to verify that counts are correct after
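
To replay the new assertions locally, running the single test class through Surefire should suffice, e.g. mvn test -Dtest=TestNamenodeCapacityReport from hadoop-hdfs-project/hadoop-hdfs.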