HDFS-7725. Incorrect 'nodes in service' metrics caused all writes to fail. Contributed by Ming Ma.
(cherry picked from commit 8104d52269)
This commit is contained in:
parent
653ef52ef2
commit
1d23e1ec07
|
@ -28,6 +28,9 @@ Release 2.7.2 - UNRELEASED
|
|||
HDFS-6945. BlockManager should remove a block from excessReplicateMap and
|
||||
decrement ExcessBlocks metric when the block is removed. (aajisaka)
|
||||
|
||||
HDFS-7725. Incorrect "nodes in service" metrics caused all writes to fail.
|
||||
(Ming Ma via wang)
|
||||
|
||||
HDFS-8806. Inconsistent metrics: number of missing blocks with replication
|
||||
factor 1 not properly cleared. (Zhe Zhang via aajisaka)
|
||||
|
||||
|
|
|
@ -197,23 +197,21 @@ public class DecommissionManager {
|
|||
*/
|
||||
@VisibleForTesting
|
||||
public void startDecommission(DatanodeDescriptor node) {
|
||||
if (!node.isDecommissionInProgress()) {
|
||||
if (!node.isAlive) {
|
||||
LOG.info("Dead node {} is decommissioned immediately.", node);
|
||||
node.setDecommissioned();
|
||||
} else if (!node.isDecommissioned()) {
|
||||
if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
|
||||
// Update DN stats maintained by HeartbeatManager
|
||||
hbManager.startDecommission(node);
|
||||
// hbManager.startDecommission will set dead node to decommissioned.
|
||||
if (node.isDecommissionInProgress()) {
|
||||
for (DatanodeStorageInfo storage : node.getStorageInfos()) {
|
||||
LOG.info("Starting decommission of {} {} with {} blocks",
|
||||
LOG.info("Starting decommission of {} {} with {} blocks",
|
||||
node, storage, storage.numBlocks());
|
||||
}
|
||||
// Update DN stats maintained by HeartbeatManager
|
||||
hbManager.startDecommission(node);
|
||||
node.decommissioningStatus.setStartTime(monotonicNow());
|
||||
pendingNodes.add(node);
|
||||
}
|
||||
} else {
|
||||
LOG.trace("startDecommission: Node {} is already decommission in "
|
||||
+ "progress, nothing to do.", node);
|
||||
LOG.trace("startDecommission: Node {} in {}, nothing to do." +
|
||||
node, node.getAdminState());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -221,12 +219,12 @@ public class DecommissionManager {
|
|||
* Stop decommissioning the specified datanode.
|
||||
* @param node
|
||||
*/
|
||||
void stopDecommission(DatanodeDescriptor node) {
|
||||
@VisibleForTesting
|
||||
public void stopDecommission(DatanodeDescriptor node) {
|
||||
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
|
||||
LOG.info("Stopping decommissioning of node {}", node);
|
||||
// Update DN stats maintained by HeartbeatManager
|
||||
hbManager.stopDecommission(node);
|
||||
// Over-replicated blocks will be detected and processed when
|
||||
// Over-replicated blocks will be detected and processed when
|
||||
// the dead node comes back and send in its full block report.
|
||||
if (node.isAlive) {
|
||||
blockManager.processOverReplicatedBlocksOnReCommission(node);
|
||||
|
@ -235,8 +233,8 @@ public class DecommissionManager {
|
|||
pendingNodes.remove(node);
|
||||
decomNodeBlocks.remove(node);
|
||||
} else {
|
||||
LOG.trace("stopDecommission: Node {} is not decommission in progress " +
|
||||
"or decommissioned, nothing to do.", node);
|
||||
LOG.trace("stopDecommission: Node {} in {}, nothing to do." +
|
||||
node, node.getAdminState());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -20,8 +20,6 @@ package org.apache.hadoop.hdfs.server.blockmanagement;
|
|||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.DFSUtil;
|
||||
|
@ -31,6 +29,8 @@ import org.apache.hadoop.hdfs.server.protocol.StorageReport;
|
|||
import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;
|
||||
import org.apache.hadoop.util.Daemon;
|
||||
import org.apache.hadoop.util.Time;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Manage the heartbeats received from datanodes.
|
||||
|
@ -38,7 +38,7 @@ import org.apache.hadoop.util.Time;
|
|||
* by the heartbeat manager lock.
|
||||
*/
|
||||
class HeartbeatManager implements DatanodeStatistics {
|
||||
static final Log LOG = LogFactory.getLog(HeartbeatManager.class);
|
||||
static final Logger LOG = LoggerFactory.getLogger(HeartbeatManager.class);
|
||||
|
||||
/**
|
||||
* Stores a subset of the datanodeMap in DatanodeManager,
|
||||
|
@ -227,15 +227,26 @@ class HeartbeatManager implements DatanodeStatistics {
|
|||
}
|
||||
|
||||
synchronized void startDecommission(final DatanodeDescriptor node) {
|
||||
stats.subtract(node);
|
||||
node.startDecommission();
|
||||
stats.add(node);
|
||||
if (!node.isAlive) {
|
||||
LOG.info("Dead node {} is decommissioned immediately.", node);
|
||||
node.setDecommissioned();
|
||||
} else {
|
||||
stats.subtract(node);
|
||||
node.startDecommission();
|
||||
stats.add(node);
|
||||
}
|
||||
}
|
||||
|
||||
synchronized void stopDecommission(final DatanodeDescriptor node) {
|
||||
stats.subtract(node);
|
||||
node.stopDecommission();
|
||||
stats.add(node);
|
||||
LOG.info("Stopping decommissioning of {} node {}",
|
||||
node.isAlive ? "live" : "dead", node);
|
||||
if (!node.isAlive) {
|
||||
node.stopDecommission();
|
||||
} else {
|
||||
stats.subtract(node);
|
||||
node.stopDecommission();
|
||||
stats.add(node);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -202,9 +202,14 @@ public class TestNamenodeCapacityReport {
|
|||
dn.shutdown();
|
||||
DFSTestUtil.setDatanodeDead(dnd);
|
||||
BlockManagerTestUtil.checkHeartbeat(namesystem.getBlockManager());
|
||||
//Verify decommission of dead node won't impact nodesInService metrics.
|
||||
dnm.getDecomManager().startDecommission(dnd);
|
||||
expectedInServiceNodes--;
|
||||
assertEquals(expectedInServiceNodes, namesystem.getNumLiveDataNodes());
|
||||
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
|
||||
//Verify recommission of dead node won't impact nodesInService metrics.
|
||||
dnm.getDecomManager().stopDecommission(dnd);
|
||||
assertEquals(expectedInServiceNodes, getNumDNInService(namesystem));
|
||||
}
|
||||
|
||||
// restart the nodes to verify that counts are correct after
|
||||
|
|
Loading…
Reference in New Issue