HDFS-7725. Incorrect 'nodes in service' metrics caused all writes to fail. Contributed by Ming Ma.

This commit is contained in:
Andrew Wang 2015-04-08 15:52:06 -07:00
parent 9792500c54
commit 8104d52269
4 changed files with 41 additions and 24 deletions

View File

@ -126,6 +126,9 @@ Release 2.8.0 - UNRELEASED
HDFS-5215. dfs.datanode.du.reserved is not considered while computing
available space ( Brahma Reddy Battula via Yongjun Zhang)
HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
(Ming Ma via wang)
Release 2.7.0 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@ -197,23 +197,21 @@ void close() {
*/ */
@VisibleForTesting @VisibleForTesting
public void startDecommission(DatanodeDescriptor node) { public void startDecommission(DatanodeDescriptor node) {
if (!node.isDecommissionInProgress()) { if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
if (!node.isAlive) { // Update DN stats maintained by HeartbeatManager
LOG.info("Dead node {} is decommissioned immediately.", node); hbManager.startDecommission(node);
node.setDecommissioned(); // hbManager.startDecommission will set dead node to decommissioned.
} else if (!node.isDecommissioned()) { if (node.isDecommissionInProgress()) {
for (DatanodeStorageInfo storage : node.getStorageInfos()) { for (DatanodeStorageInfo storage : node.getStorageInfos()) {
LOG.info("Starting decommission of {} {} with {} blocks", LOG.info("Starting decommission of {} {} with {} blocks",
node, storage, storage.numBlocks()); node, storage, storage.numBlocks());
} }
// Update DN stats maintained by HeartbeatManager
hbManager.startDecommission(node);
node.decommissioningStatus.setStartTime(monotonicNow()); node.decommissioningStatus.setStartTime(monotonicNow());
pendingNodes.add(node); pendingNodes.add(node);
} }
} else { } else {
LOG.trace("startDecommission: Node {} is already decommission in " LOG.trace("startDecommission: Node {} in {}, nothing to do." +
+ "progress, nothing to do.", node); node, node.getAdminState());
} }
} }
@ -221,9 +219,9 @@ public void startDecommission(DatanodeDescriptor node) {
* Stop decommissioning the specified datanode. * Stop decommissioning the specified datanode.
* @param node * @param node
*/ */
void stopDecommission(DatanodeDescriptor node) { @VisibleForTesting
public void stopDecommission(DatanodeDescriptor node) {
if (node.isDecommissionInProgress() || node.isDecommissioned()) { if (node.isDecommissionInProgress() || node.isDecommissioned()) {
LOG.info("Stopping decommissioning of node {}", node);
// Update DN stats maintained by HeartbeatManager // Update DN stats maintained by HeartbeatManager
hbManager.stopDecommission(node); hbManager.stopDecommission(node);
// Over-replicated blocks will be detected and processed when // Over-replicated blocks will be detected and processed when
@ -235,8 +233,8 @@ void stopDecommission(DatanodeDescriptor node) {
pendingNodes.remove(node); pendingNodes.remove(node);
decomNodeBlocks.remove(node); decomNodeBlocks.remove(node);
} else { } else {
LOG.trace("stopDecommission: Node {} is not decommission in progress " + LOG.trace("stopDecommission: Node {} in {}, nothing to do." +
"or decommissioned, nothing to do.", node); node, node.getAdminState());
} }
} }

View File

@ -20,8 +20,6 @@
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DFSUtil;
@ -31,6 +29,8 @@
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary; import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
import org.apache.hadoop.util.Daemon; import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** /**
* Manage the heartbeats received from datanodes. * Manage the heartbeats received from datanodes.
@ -38,7 +38,7 @@
* by the heartbeat manager lock. * by the heartbeat manager lock.
*/ */
class HeartbeatManager implements DatanodeStatistics { class HeartbeatManager implements DatanodeStatistics {
static final Log LOG = LogFactory.getLog(HeartbeatManager.class); static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
/** /**
* Stores a subset of the datanodeMap in DatanodeManager, * Stores a subset of the datanodeMap in DatanodeManager,
@ -227,16 +227,27 @@ synchronized void updateHeartbeat(final DatanodeDescriptor node,
} }
synchronized void startDecommission(final DatanodeDescriptor node) { synchronized void startDecommission(final DatanodeDescriptor node) {
if (!node.isAlive) {
LOG.info("Dead node {} is decommissioned immediately.", node);
node.setDecommissioned();
} else {
stats.subtract(node); stats.subtract(node);
node.startDecommission(); node.startDecommission();
stats.add(node); stats.add(node);
} }
}
synchronized void stopDecommission(final DatanodeDescriptor node) { synchronized void stopDecommission(final DatanodeDescriptor node) {
LOG.info("Stopping decommissioning of {} node {}",
node.isAlive ? "live" : "dead", node);
if (!node.isAlive) {
node.stopDecommission();
} else {
stats.subtract(node); stats.subtract(node);
node.stopDecommission(); node.stopDecommission();
stats.add(node); stats.add(node);
} }
}
/** /**
* Check if there are any expired heartbeats, and if so, * Check if there are any expired heartbeats, and if so,

View File

@ -202,9 +202,14 @@ public void testXceiverCount() throws Exception {
dn.shutdown(); dn.shutdown();
DFSTestUtil.setDatanodeDead(dnd); DFSTestUtil.setDatanodeDead(dnd);
BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager()); BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
//Verify decommission of dead node won't impact nodesInService metrics.
dnm.getDecomManager().startDecommission(dnd);
expectedInServiceNodes--; expectedInServiceNodes--;
assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes()); assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem)); assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
//Verify recommission of dead node won't impact nodesInService metrics.
dnm.getDecomManager().stopDecommission(dnd);
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
} }
// restart the nodes to verify that counts are correct after // restart the nodes to verify that counts are correct after