HDFS-3245. Add metrics and web UI for cluster version summary. Contributed by Ravi Prakash.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1517942 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Kihwal Lee 2013-08-27 19:25:29 +00:00
parent ba89f8efc7
commit 7246828377
8 changed files with 155 additions and 12 deletions

View File

@ -107,6 +107,9 @@ Release 2.1.1-beta - UNRELEASED
HDFS-5045. Add more unit tests for retry cache to cover all AtMostOnce
methods. (jing9)
HDFS-3245. Add metrics and web UI for cluster version summary. (Ravi
Prakash via kihwal)
OPTIMIZATIONS
BUG FIXES

View File

@ -47,6 +47,7 @@ public class DatanodeInfo extends DatanodeID implements Node {
private long lastUpdate;
private int xceiverCount;
private String location = NetworkTopology.DEFAULT_RACK;
private String softwareVersion;
// Datanode administrative states
public enum AdminStates {
@ -381,4 +382,12 @@ public class DatanodeInfo extends DatanodeID implements Node {
// by DatanodeID
return (this == obj) || super.equals(obj);
}
public String getSoftwareVersion() {
return softwareVersion;
}
public void setSoftwareVersion(String softwareVersion) {
this.softwareVersion = softwareVersion;
}
}

View File

@ -26,6 +26,7 @@ import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableMap;
@ -55,13 +56,6 @@ import org.apache.hadoop.hdfs.server.namenode.HostFileManager.MutableEntrySet;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager.Entry;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager.EntrySet;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager.MutableEntrySet;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
@ -71,7 +65,6 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.RegisterCommand;
import org.apache.hadoop.hdfs.util.CyclicIteration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.CachedDNSToSwitchMapping;
import org.apache.hadoop.net.DNSToSwitchMapping;
@ -173,6 +166,14 @@ public class DatanodeManager {
*/
private boolean hasClusterEverBeenMultiRack = false;
/**
* The number of datanodes for each software version. This list should change
* during rolling upgrades.
* Software version -> Number of datanodes with this version
*/
private HashMap<String, Integer> datanodesSoftwareVersions =
new HashMap<String, Integer>(4, 0.75f);
DatanodeManager(final BlockManager blockManager, final Namesystem namesystem,
final Configuration conf) throws IOException {
this.namesystem = namesystem;
@ -463,6 +464,7 @@ public class DatanodeManager {
heartbeatManager.removeDatanode(nodeInfo);
blockManager.removeBlocksAssociatedTo(nodeInfo);
networktopology.remove(nodeInfo);
decrementVersionCount(nodeInfo.getSoftwareVersion());
if (LOG.isDebugEnabled()) {
LOG.debug("remove datanode " + nodeInfo);
@ -545,6 +547,61 @@ public class DatanodeManager {
}
}
private void incrementVersionCount(String version) {
if (version == null) {
return;
}
synchronized(datanodeMap) {
Integer count = this.datanodesSoftwareVersions.get(version);
count = count == null ? 1 : count + 1;
this.datanodesSoftwareVersions.put(version, count);
}
}
private void decrementVersionCount(String version) {
if (version == null) {
return;
}
synchronized(datanodeMap) {
Integer count = this.datanodesSoftwareVersions.get(version);
if(count != null) {
if(count > 1) {
this.datanodesSoftwareVersions.put(version, count-1);
} else {
this.datanodesSoftwareVersions.remove(version);
}
}
}
}
private boolean shouldCountVersion(DatanodeDescriptor node) {
return node.getSoftwareVersion() != null && node.isAlive &&
!isDatanodeDead(node);
}
private void countSoftwareVersions() {
synchronized(datanodeMap) {
HashMap<String, Integer> versionCount = new HashMap<String, Integer>();
for(DatanodeDescriptor dn: datanodeMap.values()) {
// Check isAlive too because right after removeDatanode(),
// isDatanodeDead() is still true
if(shouldCountVersion(dn))
{
Integer num = versionCount.get(dn.getSoftwareVersion());
num = num == null ? 1 : num+1;
versionCount.put(dn.getSoftwareVersion(), num);
}
}
this.datanodesSoftwareVersions = versionCount;
}
}
public HashMap<String, Integer> getDatanodesSoftwareVersions() {
synchronized(datanodeMap) {
return new HashMap<String, Integer> (this.datanodesSoftwareVersions);
}
}
/* Resolve a node's network location */
private String resolveNetworkLocation (DatanodeID node) {
List<String> names = new ArrayList<String>(1);
@ -761,7 +818,12 @@ public class DatanodeManager {
try {
// update cluster map
getNetworkTopology().remove(nodeS);
if(shouldCountVersion(nodeS)) {
decrementVersionCount(nodeS.getSoftwareVersion());
}
nodeS.updateRegInfo(nodeReg);
nodeS.setSoftwareVersion(nodeReg.getSoftwareVersion());
nodeS.setDisallowed(false); // Node is in the include list
// resolve network location
@ -770,12 +832,14 @@ public class DatanodeManager {
// also treat the registration message as a heartbeat
heartbeatManager.register(nodeS);
incrementVersionCount(nodeS.getSoftwareVersion());
checkDecommissioning(nodeS);
success = true;
} finally {
if (!success) {
removeDatanode(nodeS);
wipeDatanode(nodeS);
countSoftwareVersions();
}
}
return;
@ -799,6 +863,7 @@ public class DatanodeManager {
try {
nodeDescr.setNetworkLocation(resolveNetworkLocation(nodeDescr));
networktopology.add(nodeDescr);
nodeDescr.setSoftwareVersion(nodeReg.getSoftwareVersion());
// register new datanode
addDatanode(nodeDescr);
@ -809,10 +874,12 @@ public class DatanodeManager {
// because its is done when the descriptor is created
heartbeatManager.addDatanode(nodeDescr);
success = true;
incrementVersionCount(nodeReg.getSoftwareVersion());
} finally {
if (!success) {
removeDatanode(nodeDescr);
wipeDatanode(nodeDescr);
countSoftwareVersions();
}
}
} catch (InvalidTopologyException e) {
@ -834,6 +901,7 @@ public class DatanodeManager {
namesystem.writeLock();
try {
refreshDatanodes();
countSoftwareVersions();
} finally {
namesystem.writeUnlock();
}

View File

@ -359,6 +359,7 @@ class ClusterJspHelper {
nn.httpAddress = httpAddress;
getLiveNodeCount(getProperty(props, "LiveNodes").getValueAsText(), nn);
getDeadNodeCount(getProperty(props, "DeadNodes").getValueAsText(), nn);
nn.softwareVersion = getProperty(props, "SoftwareVersion").getTextValue();
return nn;
}
@ -596,6 +597,7 @@ class ClusterJspHelper {
toXmlItemBlockWithLink(doc, nn.deadDatanodeCount + " (" +
nn.deadDecomCount + ")", nn.httpAddress+"/dfsnodelist.jsp?whatNodes=DEAD"
, "Dead Datanode (Decommissioned)");
toXmlItemBlock(doc, "Software Version", nn.softwareVersion);
doc.endTag(); // node
}
doc.endTag(); // namenodes
@ -624,6 +626,7 @@ class ClusterJspHelper {
int deadDatanodeCount = 0;
int deadDecomCount = 0;
String httpAddress = null;
String softwareVersion = "";
}
/**

View File

@ -6208,6 +6208,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed());
innerinfo.put("capacity", node.getCapacity());
innerinfo.put("numBlocks", node.numBlocks());
innerinfo.put("version", node.getSoftwareVersion());
info.put(node.getHostName(), innerinfo);
}
return JSON.toString(info);
@ -6419,6 +6420,22 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
return JSON.toString(list);
}
@Override //NameNodeMXBean
public int getDistinctVersionCount() {
return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
.size();
}
@Override //NameNodeMXBean
public Map<String, Integer> getDistinctVersions() {
return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
}
@Override //NameNodeMXBean
public String getSoftwareVersion() {
return VersionInfo.getVersion();
}
/**
* Verifies that the given identifier and password are valid and match.
* @param identifier Token identifier.

View File

@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.util.Map;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
@ -34,6 +36,12 @@ public interface NameNodeMXBean {
*/
public String getVersion();
/**
* Get the version of software running on the Namenode
* @return a string representing the version
*/
public String getSoftwareVersion();
/**
* Gets the used space by data nodes.
*
@ -215,4 +223,19 @@ public interface NameNodeMXBean {
* @return the list of corrupt files, as a JSON string.
*/
public String getCorruptFiles();
/**
* Get the number of distinct versions of live datanodes
*
* @return the number of distinct versions of live datanodes
*/
public int getDistinctVersionCount();
/**
* Get the number of live datanodes for each distinct versions
*
* @return the number of live datanodes for each distinct versions
*/
public Map<String, Integer> getDistinctVersions();
}

View File

@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;
@ -99,6 +100,20 @@ class NamenodeJspHelper {
}
}
static String getRollingUpgradeText(FSNamesystem fsn) {
DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
Map<String, Integer> list = dm.getDatanodesSoftwareVersions();
if(list.size() > 1) {
StringBuffer status = new StringBuffer("Rolling upgrades in progress. " +
"There are " + list.size() + " versions of datanodes currently live: ");
for(Map.Entry<String, Integer> ver: list.entrySet()) {
status.append(ver.getKey() + "(" + ver.getValue() + "), ");
}
return status.substring(0, status.length()-2);
}
return "";
}
static String getInodeLimitText(FSNamesystem fsn) {
if (fsn == null) {
return "";
@ -802,7 +817,9 @@ class NamenodeJspHelper {
+ "<td align=\"right\" class=\"pcbpused\">"
+ percentBpUsed
+ "<td align=\"right\" class=\"volfails\">"
+ d.getVolumeFailures() + "\n");
+ d.getVolumeFailures()
+ "<td align=\"right\" class=\"version\">"
+ d.getSoftwareVersion() + "\n");
}
void generateNodesList(ServletContext context, JspWriter out,
@ -900,7 +917,9 @@ class NamenodeJspHelper {
+ nodeHeaderStr("pcbpused")
+ "> Block Pool<br>Used (%)" + " <th "
+ nodeHeaderStr("volfails")
+"> Failed Volumes\n");
+"> Failed Volumes <th "
+ nodeHeaderStr("versionString")
+"> Version\n");
JspHelper.sortNodeList(live, sorterField, sorterOrder);
for (int i = 0; i < live.size(); i++) {

View File

@ -65,6 +65,7 @@
<h3>Cluster Summary</h3>
<b> <%= NamenodeJspHelper.getSecurityModeText()%> </b>
<b> <%= NamenodeJspHelper.getSafeModeText(fsn)%> </b>
<b> <%= NamenodeJspHelper.getRollingUpgradeText(fsn)%> </b>
<b> <%= NamenodeJspHelper.getInodeLimitText(fsn)%> </b>
<%= NamenodeJspHelper.getCorruptFilesWarning(fsn)%>