HDFS-3245. Add metrics and web UI for cluster version summary. Contributed by Ravi Prakash.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1517942 13f79535-47bb-0310-9956-ffa450edef68
Kihwal Lee 2013-08-27 19:25:29 +00:00
parent ba89f8efc7
commit 7246828377
8 changed files with 155 additions and 12 deletions

View File

@@ -107,6 +107,9 @@ Release 2.1.1-beta - UNRELEASED
HDFS-5045. Add more unit tests for retry cache to cover all AtMostOnce
methods. (jing9)
HDFS-3245. Add metrics and web UI for cluster version summary. (Ravi
Prakash via kihwal)
OPTIMIZATIONS
BUG FIXES

View File

@@ -47,6 +47,7 @@ public class DatanodeInfo extends DatanodeID implements Node {
private long lastUpdate;
private int xceiverCount;
private String location = NetworkTopology.DEFAULT_RACK;
private String softwareVersion;
// Datanode administrative states
public enum AdminStates {
@@ -381,4 +382,12 @@ public class DatanodeInfo extends DatanodeID implements Node {
// by DatanodeID
return (this == obj) || super.equals(obj);
}
public String getSoftwareVersion() {
return softwareVersion;
}
public void setSoftwareVersion(String softwareVersion) {
this.softwareVersion = softwareVersion;
}
}

View File

@@ -26,6 +26,7 @@ import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableMap;
@@ -55,13 +56,6 @@ import org.apache.hadoop.hdfs.server.namenode.HostFileManager.MutableEntrySet;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager.Entry;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager.EntrySet;
import org.apache.hadoop.hdfs.server.namenode.HostFileManager.MutableEntrySet;
import org.apache.hadoop.hdfs.server.namenode.NameNode;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.BalancerBandwidthCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
@@ -71,7 +65,6 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.RegisterCommand;
import org.apache.hadoop.hdfs.util.CyclicIteration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.CachedDNSToSwitchMapping;
import org.apache.hadoop.net.DNSToSwitchMapping;
@@ -173,6 +166,14 @@ public class DatanodeManager {
*/
private boolean hasClusterEverBeenMultiRack = false;
/**
* The number of datanodes for each software version. This list should change
* during rolling upgrades.
* Software version -> Number of datanodes with this version
*/
private HashMap<String, Integer> datanodesSoftwareVersions =
new HashMap<String, Integer>(4, 0.75f);
DatanodeManager(final BlockManager blockManager, final Namesystem namesystem,
final Configuration conf) throws IOException {
this.namesystem = namesystem;
@@ -463,6 +464,7 @@ public class DatanodeManager {
heartbeatManager.removeDatanode(nodeInfo);
blockManager.removeBlocksAssociatedTo(nodeInfo);
networktopology.remove(nodeInfo);
decrementVersionCount(nodeInfo.getSoftwareVersion());
if (LOG.isDebugEnabled()) {
LOG.debug("remove datanode " + nodeInfo);
@@ -545,6 +547,61 @@ public class DatanodeManager {
}
}
private void incrementVersionCount(String version) {
if (version == null) {
return;
}
synchronized(datanodeMap) {
Integer count = this.datanodesSoftwareVersions.get(version);
count = count == null ? 1 : count + 1;
this.datanodesSoftwareVersions.put(version, count);
}
}
private void decrementVersionCount(String version) {
if (version == null) {
return;
}
synchronized(datanodeMap) {
Integer count = this.datanodesSoftwareVersions.get(version);
if(count != null) {
if(count > 1) {
this.datanodesSoftwareVersions.put(version, count-1);
} else {
this.datanodesSoftwareVersions.remove(version);
}
}
}
}
private boolean shouldCountVersion(DatanodeDescriptor node) {
return node.getSoftwareVersion() != null && node.isAlive &&
!isDatanodeDead(node);
}
private void countSoftwareVersions() {
synchronized(datanodeMap) {
HashMap<String, Integer> versionCount = new HashMap<String, Integer>();
for(DatanodeDescriptor dn: datanodeMap.values()) {
// Check isAlive too because right after removeDatanode(),
// isDatanodeDead() is still true
if(shouldCountVersion(dn))
{
Integer num = versionCount.get(dn.getSoftwareVersion());
num = num == null ? 1 : num+1;
versionCount.put(dn.getSoftwareVersion(), num);
}
}
this.datanodesSoftwareVersions = versionCount;
}
}
public HashMap<String, Integer> getDatanodesSoftwareVersions() {
synchronized(datanodeMap) {
return new HashMap<String, Integer> (this.datanodesSoftwareVersions);
}
}
/* Resolve a node's network location */
private String resolveNetworkLocation (DatanodeID node) {
List<String> names = new ArrayList<String>(1);
@@ -761,7 +818,12 @@ public class DatanodeManager {
try {
// update cluster map
getNetworkTopology().remove(nodeS);
if(shouldCountVersion(nodeS)) {
decrementVersionCount(nodeS.getSoftwareVersion());
}
nodeS.updateRegInfo(nodeReg);
nodeS.setSoftwareVersion(nodeReg.getSoftwareVersion());
nodeS.setDisallowed(false); // Node is in the include list
// resolve network location
@@ -770,12 +832,14 @@
// also treat the registration message as a heartbeat
heartbeatManager.register(nodeS);
incrementVersionCount(nodeS.getSoftwareVersion());
checkDecommissioning(nodeS);
success = true;
} finally {
if (!success) {
removeDatanode(nodeS);
wipeDatanode(nodeS);
countSoftwareVersions();
}
}
return;
@@ -799,6 +863,7 @@
try {
nodeDescr.setNetworkLocation(resolveNetworkLocation(nodeDescr));
networktopology.add(nodeDescr);
nodeDescr.setSoftwareVersion(nodeReg.getSoftwareVersion());
// register new datanode
addDatanode(nodeDescr);
@@ -809,10 +874,12 @@
// because its is done when the descriptor is created
heartbeatManager.addDatanode(nodeDescr);
success = true;
incrementVersionCount(nodeReg.getSoftwareVersion());
} finally {
if (!success) {
removeDatanode(nodeDescr);
wipeDatanode(nodeDescr);
countSoftwareVersions();
}
}
} catch (InvalidTopologyException e) {
@@ -834,6 +901,7 @@
namesystem.writeLock();
try {
refreshDatanodes();
countSoftwareVersions();
} finally {
namesystem.writeUnlock();
}
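
For illustration: the helpers added above keep a per-version tally that is adjusted on every registration and removal, and rebuilt wholesale by countSoftwareVersions() after refreshes and failed registrations. Below is a minimal, self-contained sketch of the same map bookkeeping; the class and method names are illustrative and not part of the patch, and plain synchronized methods stand in for the synchronized(datanodeMap) blocks used in the real code.

import java.util.HashMap;
import java.util.Map;

public class VersionTallySketch {
  private final Map<String, Integer> versions = new HashMap<String, Integer>();

  // Mirrors incrementVersionCount(): null versions are ignored.
  synchronized void increment(String version) {
    if (version == null) return;
    Integer count = versions.get(version);
    versions.put(version, count == null ? 1 : count + 1);
  }

  // Mirrors decrementVersionCount(): the key is dropped when the last node of that version leaves.
  synchronized void decrement(String version) {
    if (version == null) return;
    Integer count = versions.get(version);
    if (count == null) return;
    if (count > 1) {
      versions.put(version, count - 1);
    } else {
      versions.remove(version);
    }
  }

  public static void main(String[] args) {
    VersionTallySketch tally = new VersionTallySketch();
    tally.increment("2.1.0-beta");   // old datanode registers
    tally.increment("2.1.0-beta");
    tally.increment("2.1.1-beta");   // upgraded datanode re-registers
    tally.decrement("2.1.0-beta");   // old instance is removed
    System.out.println(tally.versions); // prints something like {2.1.0-beta=1, 2.1.1-beta=1}
  }
}

In the patch itself the tally is additionally rebuilt from scratch by countSoftwareVersions() after refreshNodes() and after failed registrations, which keeps the map consistent even if an increment/decrement pair is missed.
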

View File

@@ -359,6 +359,7 @@ class ClusterJspHelper {
nn.httpAddress = httpAddress;
getLiveNodeCount(getProperty(props, "LiveNodes").getValueAsText(), nn);
getDeadNodeCount(getProperty(props, "DeadNodes").getValueAsText(), nn);
nn.softwareVersion = getProperty(props, "SoftwareVersion").getTextValue();
return nn;
}
@@ -596,6 +597,7 @@ class ClusterJspHelper {
toXmlItemBlockWithLink(doc, nn.deadDatanodeCount + " (" +
nn.deadDecomCount + ")", nn.httpAddress+"/dfsnodelist.jsp?whatNodes=DEAD"
, "Dead Datanode (Decommissioned)");
toXmlItemBlock(doc, "Software Version", nn.softwareVersion);
doc.endTag(); // node
}
doc.endTag(); // namenodes
@@ -624,6 +626,7 @@ class ClusterJspHelper {
int deadDatanodeCount = 0;
int deadDecomCount = 0;
String httpAddress = null;
String softwareVersion = "";
}
/**

View File

@@ -6208,6 +6208,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed());
innerinfo.put("capacity", node.getCapacity());
innerinfo.put("numBlocks", node.numBlocks());
innerinfo.put("version", node.getSoftwareVersion());
info.put(node.getHostName(), innerinfo);
}
return JSON.toString(info);
@@ -6419,6 +6420,22 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
return JSON.toString(list);
}
@Override //NameNodeMXBean
public int getDistinctVersionCount() {
return blockManager.getDatanodeManager().getDatanodesSoftwareVersions()
.size();
}
@Override //NameNodeMXBean
public Map<String, Integer> getDistinctVersions() {
return blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
}
@Override //NameNodeMXBean
public String getSoftwareVersion() {
return VersionInfo.getVersion();
}
/**
* Verifies that the given identifier and password are valid and match.
* @param identifier Token identifier.
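
For illustration: the LiveNodes JMX attribute built by getLiveNodes() now carries a per-node "version" entry. The sketch below shows roughly what that JSON looks like; the hostname, the other fields, and the use of Jetty's org.mortbay.util.ajax.JSON helper (the class FSNamesystem itself uses on branch-2) are assumptions for the example, and only the new "version" key comes from this patch.

import java.util.HashMap;
import java.util.Map;
import org.mortbay.util.ajax.JSON;  // Jetty 6 JSON helper, assumed to be on the classpath

public class LiveNodesJsonSketch {
  public static void main(String[] args) {
    // One entry per live datanode, keyed by hostname; the fields here are a subset.
    Map<String, Object> innerinfo = new HashMap<String, Object>();
    innerinfo.put("usedSpace", 1024L);
    innerinfo.put("adminState", "In Service");
    innerinfo.put("version", "2.1.1-beta");   // new field added by HDFS-3245

    Map<String, Object> info = new HashMap<String, Object>();
    info.put("datanode01.example.com", innerinfo);
    System.out.println(JSON.toString(info));
    // prints something like:
    // {"datanode01.example.com":{"usedSpace":1024,"adminState":"In Service","version":"2.1.1-beta"}}
  }
}
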

View File

@@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.util.Map;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
@@ -34,6 +36,12 @@ public interface NameNodeMXBean {
*/
public String getVersion();
/**
* Get the version of software running on the Namenode
* @return a string representing the version
*/
public String getSoftwareVersion();
/**
* Gets the used space by data nodes.
*
@@ -215,4 +223,19 @@ public interface NameNodeMXBean {
* @return the list of corrupt files, as a JSON string.
*/
public String getCorruptFiles();
/**
* Get the number of distinct versions of live datanodes
*
* @return the number of distinct versions of live datanodes
*/
public int getDistinctVersionCount();
/**
* Get the number of live datanodes for each distinct versions
*
* @return the number of live datanodes for each distinct versions
*/
public Map<String, Integer> getDistinctVersions();
}
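
For illustration: the new getters are ordinary MXBean attributes, so they can be read by any JMX client. The sketch below is a hedged example, not part of the patch; it assumes the NameNode JVM was started with remote JMX enabled on port 8004 and that the bean is registered under the name Hadoop:service=NameNode,name=NameNodeInfo (the name FSNamesystem's MXBean registration normally produces).

import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public class NameNodeVersionProbe {
  public static void main(String[] args) throws Exception {
    // Assumes -Dcom.sun.management.jmxremote.port=8004 on the NameNode; host and port are illustrative.
    JMXServiceURL url =
        new JMXServiceURL("service:jmx:rmi:///jndi/rmi://namenode.example.com:8004/jmxrmi");
    JMXConnector connector = JMXConnectorFactory.connect(url);
    try {
      MBeanServerConnection mbs = connector.getMBeanServerConnection();
      ObjectName nnInfo = new ObjectName("Hadoop:service=NameNode,name=NameNodeInfo");
      // Attribute names follow the getter names added to NameNodeMXBean above.
      System.out.println("NameNode software version: "
          + mbs.getAttribute(nnInfo, "SoftwareVersion"));
      System.out.println("Distinct datanode versions: "
          + mbs.getAttribute(nnInfo, "DistinctVersionCount"));
      System.out.println("Version breakdown: "
          + mbs.getAttribute(nnInfo, "DistinctVersions"));
    } finally {
      connector.close();
    }
  }
}

The same attributes also appear in the JSON served by the NameNode's /jmx servlet on its HTTP port, which is how ClusterJspHelper above picks up SoftwareVersion from remote NameNodes.
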

View File

@@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;
@@ -99,6 +100,20 @@ class NamenodeJspHelper {
}
}
static String getRollingUpgradeText(FSNamesystem fsn) {
DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
Map<String, Integer> list = dm.getDatanodesSoftwareVersions();
if(list.size() > 1) {
StringBuffer status = new StringBuffer("Rolling upgrades in progress. " +
"There are " + list.size() + " versions of datanodes currently live: ");
for(Map.Entry<String, Integer> ver: list.entrySet()) {
status.append(ver.getKey() + "(" + ver.getValue() + "), ");
}
return status.substring(0, status.length()-2);
}
return "";
}
static String getInodeLimitText(FSNamesystem fsn) {
if (fsn == null) {
return "";
@@ -802,7 +817,9 @@ class NamenodeJspHelper {
+ "<td align=\"right\" class=\"pcbpused\">"
+ percentBpUsed
+ "<td align=\"right\" class=\"volfails\">"
+ d.getVolumeFailures() + "\n");
+ d.getVolumeFailures()
+ "<td align=\"right\" class=\"version\">"
+ d.getSoftwareVersion() + "\n");
}
void generateNodesList(ServletContext context, JspWriter out,
@@ -900,7 +917,9 @@ class NamenodeJspHelper {
+ nodeHeaderStr("pcbpused")
+ "> Block Pool<br>Used (%)" + " <th "
+ nodeHeaderStr("volfails")
+"> Failed Volumes\n"); +"> Failed Volumes <th "
+ nodeHeaderStr("versionString")
+"> Version\n");
JspHelper.sortNodeList(live, sorterField, sorterOrder);
for (int i = 0; i < live.size(); i++) {

View File

@@ -65,6 +65,7 @@
<h3>Cluster Summary</h3>
<b> <%= NamenodeJspHelper.getSecurityModeText()%> </b>
<b> <%= NamenodeJspHelper.getSafeModeText(fsn)%> </b>
<b> <%= NamenodeJspHelper.getRollingUpgradeText(fsn)%> </b>
<b> <%= NamenodeJspHelper.getInodeLimitText(fsn)%> </b>
<%= NamenodeJspHelper.getCorruptFilesWarning(fsn)%>
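
For illustration: the getRollingUpgradeText(fsn) call added to the Cluster Summary above renders nothing unless more than one datanode version is live. The standalone sketch below reproduces the string construction from NamenodeJspHelper.getRollingUpgradeText(); the input map stands in for DatanodeManager.getDatanodesSoftwareVersions(), and the version strings and counts are made up.

import java.util.LinkedHashMap;
import java.util.Map;

public class RollingUpgradeTextSketch {
  // Same construction as getRollingUpgradeText(), minus the DatanodeManager lookup.
  static String render(Map<String, Integer> versions) {
    if (versions.size() <= 1) {
      return "";   // single-version cluster: nothing is shown
    }
    StringBuffer status = new StringBuffer("Rolling upgrades in progress. " +
        "There are " + versions.size() + " versions of datanodes currently live: ");
    for (Map.Entry<String, Integer> ver : versions.entrySet()) {
      status.append(ver.getKey() + "(" + ver.getValue() + "), ");
    }
    return status.substring(0, status.length() - 2);  // drop the trailing ", "
  }

  public static void main(String[] args) {
    Map<String, Integer> versions = new LinkedHashMap<String, Integer>();
    versions.put("2.1.0-beta", 38);   // hypothetical counts
    versions.put("2.1.1-beta", 12);
    System.out.println(render(versions));
    // Rolling upgrades in progress. There are 2 versions of datanodes currently live: 2.1.0-beta(38), 2.1.1-beta(12)
  }
}
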