HDFS-7725. Incorrect 'nodes in service' metrics caused all writes to fail. Contributed by Ming Ma.
This commit is contained in:
parent
9792500c54
commit
8104d52269
|
@ -126,6 +126,9 @@ Release 2.8.0 - UNRELEASED
|
||||||
HDFS-5215. dfs.datanode.du.reserved is not considered while computing
|
HDFS-5215. dfs.datanode.du.reserved is not considered while computing
|
||||||
available space ( Brahma Reddy Battula via Yongjun Zhang)
|
available space ( Brahma Reddy Battula via Yongjun Zhang)
|
||||||
|
|
||||||
|
HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
|
||||||
|
(Ming Ma via wang)
|
||||||
|
|
||||||
Release 2.7.0 - UNRELEASED
|
Release 2.7.0 - UNRELEASED
|
||||||
|
|
||||||
INCOMPATIBLE CHANGES
|
INCOMPATIBLE CHANGES
|
||||||
|
|
|
@ -197,23 +197,21 @@ public class DecommissionManager {
|
||||||
*/
|
*/
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
public void startDecommission(DatanodeDescriptor node) {
|
public void startDecommission(DatanodeDescriptor node) {
|
||||||
if (!node.isDecommissionInProgress()) {
|
if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
|
||||||
if (!node.isAlive) {
|
// Update DN stats maintained by HeartbeatManager
|
||||||
LOG.info("Dead node {} is decommissioned immediately.", node);
|
hbManager.startDecommission(node);
|
||||||
node.setDecommissioned();
|
// hbManager.startDecommission will set dead node to decommissioned.
|
||||||
} else if (!node.isDecommissioned()) {
|
if (node.isDecommissionInProgress()) {
|
||||||
for (DatanodeStorageInfo storage : node.getStorageInfos()) {
|
for (DatanodeStorageInfo storage : node.getStorageInfos()) {
|
||||||
LOG.info("Starting decommission of {} {} with {} blocks",
|
LOG.info("Starting decommission of {} {} with {} blocks",
|
||||||
node, storage, storage.numBlocks());
|
node, storage, storage.numBlocks());
|
||||||
}
|
}
|
||||||
// Update DN stats maintained by HeartbeatManager
|
|
||||||
hbManager.startDecommission(node);
|
|
||||||
node.decommissioningStatus.setStartTime(monotonicNow());
|
node.decommissioningStatus.setStartTime(monotonicNow());
|
||||||
pendingNodes.add(node);
|
pendingNodes.add(node);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
LOG.trace("startDecommission: Node {} is already decommission in "
|
LOG.trace("startDecommission: Node {} in {}, nothing to do.",
|
||||||
+ "progress, nothing to do.", node);
|
node, node.getAdminState());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -221,9 +219,9 @@ public class DecommissionManager {
|
||||||
* Stop decommissioning the specified datanode.
|
* Stop decommissioning the specified datanode.
|
||||||
* @param node
|
* @param node
|
||||||
*/
|
*/
|
||||||
void stopDecommission(DatanodeDescriptor node) {
|
@VisibleForTesting
|
||||||
|
public void stopDecommission(DatanodeDescriptor node) {
|
||||||
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
||||||
LOG.info("Stopping decommissioning of node {}", node);
|
|
||||||
// Update DN stats maintained by HeartbeatManager
|
// Update DN stats maintained by HeartbeatManager
|
||||||
hbManager.stopDecommission(node);
|
hbManager.stopDecommission(node);
|
||||||
// Over-replicated blocks will be detected and processed when
|
// Over-replicated blocks will be detected and processed when
|
||||||
|
@ -235,8 +233,8 @@ public class DecommissionManager {
|
||||||
pendingNodes.remove(node);
|
pendingNodes.remove(node);
|
||||||
decomNodeBlocks.remove(node);
|
decomNodeBlocks.remove(node);
|
||||||
} else {
|
} else {
|
||||||
LOG.trace("stopDecommission: Node {} is not decommission in progress " +
|
LOG.trace("stopDecommission: Node {} in {}, nothing to do.",
|
||||||
"or decommissioned, nothing to do.", node);
|
node, node.getAdminState());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||||
import org.apache.hadoop.hdfs.DFSUtil;
|
import org.apache.hadoop.hdfs.DFSUtil;
|
||||||
|
@ -31,6 +29,8 @@ import org.apache.hadoop.hdfs.server.protocol.StorageReport;
|
||||||
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
|
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
|
||||||
import org.apache.hadoop.util.Daemon;
|
import org.apache.hadoop.util.Daemon;
|
||||||
import org.apache.hadoop.util.Time;
|
import org.apache.hadoop.util.Time;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Manage the heartbeats received from datanodes.
|
* Manage the heartbeats received from datanodes.
|
||||||
|
@ -38,7 +38,7 @@ import org.apache.hadoop.util.Time;
|
||||||
* by the heartbeat manager lock.
|
* by the heartbeat manager lock.
|
||||||
*/
|
*/
|
||||||
class HeartbeatManager implements DatanodeStatistics {
|
class HeartbeatManager implements DatanodeStatistics {
|
||||||
static final Log LOG = LogFactory.getLog(HeartbeatManager.class);
|
static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Stores a subset of the datanodeMap in DatanodeManager,
|
* Stores a subset of the datanodeMap in DatanodeManager,
|
||||||
|
@ -227,16 +227,27 @@ class HeartbeatManager implements DatanodeStatistics {
|
||||||
}
|
}
|
||||||
|
|
||||||
synchronized void startDecommission(final DatanodeDescriptor node) {
|
synchronized void startDecommission(final DatanodeDescriptor node) {
|
||||||
|
if (!node.isAlive) {
|
||||||
|
LOG.info("Dead node {} is decommissioned immediately.", node);
|
||||||
|
node.setDecommissioned();
|
||||||
|
} else {
|
||||||
stats.subtract(node);
|
stats.subtract(node);
|
||||||
node.startDecommission();
|
node.startDecommission();
|
||||||
stats.add(node);
|
stats.add(node);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
synchronized void stopDecommission(final DatanodeDescriptor node) {
|
synchronized void stopDecommission(final DatanodeDescriptor node) {
|
||||||
|
LOG.info("Stopping decommissioning of {} node {}",
|
||||||
|
node.isAlive ? "live" : "dead", node);
|
||||||
|
if (!node.isAlive) {
|
||||||
|
node.stopDecommission();
|
||||||
|
} else {
|
||||||
stats.subtract(node);
|
stats.subtract(node);
|
||||||
node.stopDecommission();
|
node.stopDecommission();
|
||||||
stats.add(node);
|
stats.add(node);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if there are any expired heartbeats, and if so,
|
* Check if there are any expired heartbeats, and if so,
|
||||||
|
|
|
@ -202,9 +202,14 @@ public class TestNamenodeCapacityReport {
|
||||||
dn.shutdown();
|
dn.shutdown();
|
||||||
DFSTestUtil.setDatanodeDead(dnd);
|
DFSTestUtil.setDatanodeDead(dnd);
|
||||||
BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
|
BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
|
||||||
|
//Verify decommission of dead node won't impact nodesInService metrics.
|
||||||
|
dnm.getDecomManager().startDecommission(dnd);
|
||||||
expectedInServiceNodes--;
|
expectedInServiceNodes--;
|
||||||
assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
|
assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
|
||||||
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
|
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
|
||||||
|
//Verify recommission of dead node won't impact nodesInService metrics.
|
||||||
|
dnm.getDecomManager().stopDecommission(dnd);
|
||||||
|
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
|
||||||
}
|
}
|
||||||
|
|
||||||
// restart the nodes to verify that counts are correct after
|
// restart the nodes to verify that counts are correct after
|
||||||
|
|
Loading…
Reference in New Issue