HDFS-3245. Add metrics and web UI for cluster version summary. Contributed by Ravi Prakash.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1517937 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Kihwal Lee 2013-08-27 19:21:15 +00:00
parent 5adba5597c
commit 39252995c4
8 changed files with 155 additions and 5 deletions

View File

@ -333,6 +333,9 @@ Release 2.1.1-beta - UNRELEASED
HDFS-5045. Add more unit tests for retry cache to cover all AtMostOnce
methods. (jing9)
HDFS-3245. Add metrics and web UI for cluster version summary. (Ravi
Prakash via kihwal)
OPTIMIZATIONS
BUG FIXES

View File

@ -47,6 +47,7 @@ public class DatanodeInfo extends DatanodeID implements Node {
private long lastUpdate;
private int xceiverCount;
private String location = NetworkTopology.DEFAULT_RACK;
private String softwareVersion;
// Datanode administrative states
public enum AdminStates {
@ -383,4 +384,12 @@ public boolean equals(Object obj) {
// by DatanodeID
return (this == obj) || super.equals(obj);
}
/**
 * Returns the software version string stored on this datanode record.
 *
 * @return the stored version; null when no value has been assigned
 *         (the field is declared without an initializer)
 */
public String getSoftwareVersion() {
  return this.softwareVersion;
}
/**
 * Stores the software version string on this datanode record.
 *
 * @param softwareVersion the version string to remember
 */
public void setSoftwareVersion(final String softwareVersion) {
  this.softwareVersion = softwareVersion;
}
}

View File

@ -26,6 +26,7 @@
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.NavigableMap;
@ -64,7 +65,6 @@
import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException;
import org.apache.hadoop.hdfs.server.protocol.RegisterCommand;
import org.apache.hadoop.hdfs.util.CyclicIteration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.net.CachedDNSToSwitchMapping;
import org.apache.hadoop.net.DNSToSwitchMapping;
@ -165,6 +165,14 @@ public class DatanodeManager {
* according to the NetworkTopology.
*/
private boolean hasClusterEverBeenMultiRack = false;
/**
 * The number of datanodes for each software version, kept as a map:
 * software version -> number of datanodes with this version.
 * The contents of this map should change during rolling upgrades.
 * Accessors in this class read and update it while synchronized on
 * datanodeMap.
 */
private HashMap<String, Integer> datanodesSoftwareVersions =
new HashMap<String, Integer>(4, 0.75f);
DatanodeManager(final BlockManager blockManager, final Namesystem namesystem,
final Configuration conf) throws IOException {
@ -456,6 +464,7 @@ private void removeDatanode(DatanodeDescriptor nodeInfo) {
heartbeatManager.removeDatanode(nodeInfo);
blockManager.removeBlocksAssociatedTo(nodeInfo);
networktopology.remove(nodeInfo);
decrementVersionCount(nodeInfo.getSoftwareVersion());
if (LOG.isDebugEnabled()) {
LOG.debug("remove datanode " + nodeInfo);
@ -538,6 +547,61 @@ private void wipeDatanode(final DatanodeID node) {
}
}
/**
 * Adds one to the tally kept for the given software version, creating the
 * entry with a count of 1 when the version is not yet present. A null
 * version is ignored. The update runs while holding the datanodeMap monitor.
 *
 * @param version the software version whose tally is increased
 */
private void incrementVersionCount(String version) {
  if (version != null) {
    synchronized (datanodeMap) {
      final Integer previous = datanodesSoftwareVersions.get(version);
      if (previous == null) {
        datanodesSoftwareVersions.put(version, 1);
      } else {
        datanodesSoftwareVersions.put(version, previous + 1);
      }
    }
  }
}
/**
 * Subtracts one from the tally kept for the given software version. When the
 * tally would drop below 1 the entry is removed entirely; a version that is
 * not tracked is left alone. A null version is ignored. The update runs
 * while holding the datanodeMap monitor.
 *
 * @param version the software version whose tally is decreased
 */
private void decrementVersionCount(String version) {
  if (version != null) {
    synchronized (datanodeMap) {
      final Integer current = datanodesSoftwareVersions.get(version);
      if (current == null) {
        // Nothing is tracked for this version, so there is nothing to undo.
        return;
      }
      if (current <= 1) {
        datanodesSoftwareVersions.remove(version);
      } else {
        datanodesSoftwareVersions.put(version, current - 1);
      }
    }
  }
}
/**
 * Decides whether a datanode contributes to the per-version tally.
 *
 * @param node the datanode to examine
 * @return true only when the node has a non-null software version, its
 *         isAlive flag is set, and isDatanodeDead() does not report it dead
 */
private boolean shouldCountVersion(DatanodeDescriptor node) {
  if (node.getSoftwareVersion() == null) {
    return false;
  }
  if (!node.isAlive) {
    return false;
  }
  return !isDatanodeDead(node);
}
/**
 * Rebuilds the per-version datanode tally from scratch by walking every
 * descriptor in datanodeMap, then swaps the fresh map in place of the old
 * one. The whole recount runs while holding the datanodeMap monitor.
 */
private void countSoftwareVersions() {
  synchronized (datanodeMap) {
    final HashMap<String, Integer> tally = new HashMap<String, Integer>();
    for (DatanodeDescriptor descriptor : datanodeMap.values()) {
      // shouldCountVersion() checks isAlive as well as isDatanodeDead();
      // the original note says the two can disagree right after
      // removeDatanode(), so both are consulted.
      if (!shouldCountVersion(descriptor)) {
        continue;
      }
      final String version = descriptor.getSoftwareVersion();
      final Integer seen = tally.get(version);
      tally.put(version, seen == null ? 1 : seen + 1);
    }
    datanodesSoftwareVersions = tally;
  }
}
/**
 * Returns a copy of the per-version datanode tally, taken while holding the
 * datanodeMap monitor. Changes made to the returned map do not affect the
 * map held by this manager.
 *
 * @return a new map of software version -> number of datanodes
 */
public HashMap<String, Integer> getDatanodesSoftwareVersions() {
  final HashMap<String, Integer> snapshot;
  synchronized (datanodeMap) {
    snapshot = new HashMap<String, Integer>(datanodesSoftwareVersions);
  }
  return snapshot;
}
/* Resolve a node's network location */
private String resolveNetworkLocation (DatanodeID node) {
List<String> names = new ArrayList<String>(1);
@ -755,21 +819,28 @@ nodes with its data cleared (or user can just remove the StorageID
try {
// update cluster map
getNetworkTopology().remove(nodeS);
if(shouldCountVersion(nodeS)) {
decrementVersionCount(nodeS.getSoftwareVersion());
}
nodeS.updateRegInfo(nodeReg);
nodeS.setSoftwareVersion(nodeReg.getSoftwareVersion());
nodeS.setDisallowed(false); // Node is in the include list
// resolve network location
nodeS.setNetworkLocation(resolveNetworkLocation(nodeS));
getNetworkTopology().add(nodeS);
// also treat the registration message as a heartbeat
heartbeatManager.register(nodeS);
incrementVersionCount(nodeS.getSoftwareVersion());
checkDecommissioning(nodeS);
success = true;
} finally {
if (!success) {
removeDatanode(nodeS);
wipeDatanode(nodeS);
countSoftwareVersions();
}
}
return;
@ -793,6 +864,7 @@ nodes with its data cleared (or user can just remove the StorageID
try {
nodeDescr.setNetworkLocation(resolveNetworkLocation(nodeDescr));
networktopology.add(nodeDescr);
nodeDescr.setSoftwareVersion(nodeReg.getSoftwareVersion());
// register new datanode
addDatanode(nodeDescr);
@ -803,10 +875,12 @@ nodes with its data cleared (or user can just remove the StorageID
// because its is done when the descriptor is created
heartbeatManager.addDatanode(nodeDescr);
success = true;
incrementVersionCount(nodeReg.getSoftwareVersion());
} finally {
if (!success) {
removeDatanode(nodeDescr);
wipeDatanode(nodeDescr);
countSoftwareVersions();
}
}
} catch (InvalidTopologyException e) {
@ -828,6 +902,7 @@ public void refreshNodes(final Configuration conf) throws IOException {
namesystem.writeLock();
try {
refreshDatanodes();
countSoftwareVersions();
} finally {
namesystem.writeUnlock();
}

View File

@ -359,6 +359,7 @@ public NamenodeStatus getNamenodeStatus(String props) throws IOException,
nn.httpAddress = httpAddress;
getLiveNodeCount(getProperty(props, "LiveNodes").getValueAsText(), nn);
getDeadNodeCount(getProperty(props, "DeadNodes").getValueAsText(), nn);
nn.softwareVersion = getProperty(props, "SoftwareVersion").getTextValue();
return nn;
}
@ -596,6 +597,7 @@ public void toXML(XMLOutputter doc) throws IOException {
toXmlItemBlockWithLink(doc, nn.deadDatanodeCount + " (" +
nn.deadDecomCount + ")", nn.httpAddress+"/dfsnodelist.jsp?whatNodes=DEAD"
, "Dead Datanode (Decommissioned)");
toXmlItemBlock(doc, "Software Version", nn.softwareVersion);
doc.endTag(); // node
}
doc.endTag(); // namenodes
@ -624,6 +626,7 @@ static class NamenodeStatus {
int deadDatanodeCount = 0;
int deadDecomCount = 0;
String httpAddress = null;
String softwareVersion = "";
}
/**

View File

@ -6225,6 +6225,7 @@ public String getLiveNodes() {
innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed());
innerinfo.put("capacity", node.getCapacity());
innerinfo.put("numBlocks", node.numBlocks());
innerinfo.put("version", node.getSoftwareVersion());
info.put(node.getHostName(), innerinfo);
}
return JSON.toString(info);
@ -6436,6 +6437,22 @@ public String getCorruptFiles() {
return JSON.toString(list);
}
/**
 * Reports how many different software versions appear in the datanode
 * manager's per-version tally.
 *
 * @return the number of entries in the per-version map
 */
@Override // NameNodeMXBean
public int getDistinctVersionCount() {
  final Map<String, Integer> versions =
      blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  return versions.size();
}
/**
 * Exposes the datanode manager's per-version tally.
 *
 * @return the map of software version -> number of datanodes, as handed
 *         back by the datanode manager
 */
@Override // NameNodeMXBean
public Map<String, Integer> getDistinctVersions() {
  final Map<String, Integer> perVersionCounts =
      blockManager.getDatanodeManager().getDatanodesSoftwareVersions();
  return perVersionCounts;
}
/**
 * Reports the software version of this Namenode, as given by
 * VersionInfo.getVersion().
 *
 * @return the version string
 */
@Override // NameNodeMXBean
public String getSoftwareVersion() {
  final String version = VersionInfo.getVersion();
  return version;
}
/**
* Verifies that the given identifier and password are valid and match.
* @param identifier Token identifier.

View File

@ -17,6 +17,8 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.util.Map;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
@ -33,7 +35,13 @@ public interface NameNodeMXBean {
* @return the version
*/
public String getVersion();
/**
 * Gets the version of the software running on the Namenode.
 *
 * @return a string representing the version
 */
public String getSoftwareVersion();
/**
* Gets the used space by data nodes.
*
@ -215,4 +223,19 @@ public interface NameNodeMXBean {
* @return the list of corrupt files, as a JSON string.
*/
public String getCorruptFiles();
/**
 * Gets the number of distinct software versions among live datanodes.
 *
 * @return the number of distinct versions of live datanodes
 */
public int getDistinctVersionCount();
/**
 * Gets the number of live datanodes for each distinct version.
 *
 * @return a map giving, for each distinct version, the number of live
 *         datanodes running it
 */
public Map<String, Integer> getDistinctVersions();
}

View File

@ -32,6 +32,7 @@
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.servlet.ServletContext;
import javax.servlet.http.HttpServletRequest;
@ -99,6 +100,20 @@ static String getSecurityModeText() {
}
}
/**
 * Builds the status text displayed when datanodes of more than one software
 * version are present in the datanode manager's per-version tally.
 *
 * @param fsn the namesystem to query; may be null
 * @return a message listing every version followed by its datanode count in
 *         parentheses, or an empty string when fsn is null or at most one
 *         version is present
 */
static String getRollingUpgradeText(FSNamesystem fsn) {
  // Same guard as getInodeLimitText(): render nothing without a namesystem
  // instead of failing with a NullPointerException.
  if (fsn == null) {
    return "";
  }
  DatanodeManager dm = fsn.getBlockManager().getDatanodeManager();
  Map<String, Integer> versions = dm.getDatanodesSoftwareVersions();
  if (versions.size() <= 1) {
    return "";
  }
  // The builder is local to this call, so the unsynchronized StringBuilder
  // is sufficient.
  StringBuilder status = new StringBuilder("Rolling upgrades in progress. ")
      .append("There are ").append(versions.size())
      .append(" versions of datanodes currently live: ");
  // Emit the separator before every entry except the first, so nothing has
  // to be trimmed from the end afterwards.
  String separator = "";
  for (Map.Entry<String, Integer> ver : versions.entrySet()) {
    status.append(separator)
        .append(ver.getKey())
        .append('(').append(ver.getValue()).append(')');
    separator = ", ";
  }
  return status.toString();
}
static String getInodeLimitText(FSNamesystem fsn) {
if (fsn == null) {
return "";
@ -802,7 +817,9 @@ void generateNodeData(JspWriter out, DatanodeDescriptor d, String suffix,
+ "<td align=\"right\" class=\"pcbpused\">"
+ percentBpUsed
+ "<td align=\"right\" class=\"volfails\">"
+ d.getVolumeFailures() + "\n");
+ d.getVolumeFailures()
+ "<td align=\"right\" class=\"version\">"
+ d.getSoftwareVersion() + "\n");
}
void generateNodesList(ServletContext context, JspWriter out,
@ -900,7 +917,9 @@ void generateNodesList(ServletContext context, JspWriter out,
+ nodeHeaderStr("pcbpused")
+ "> Block Pool<br>Used (%)" + " <th "
+ nodeHeaderStr("volfails")
+"> Failed Volumes\n");
+"> Failed Volumes <th "
+ nodeHeaderStr("versionString")
+"> Version\n");
JspHelper.sortNodeList(live, sorterField, sorterOrder);
for (int i = 0; i < live.size(); i++) {

View File

@ -65,6 +65,7 @@
<h3>Cluster Summary</h3>
<b> <%= NamenodeJspHelper.getSecurityModeText()%> </b>
<b> <%= NamenodeJspHelper.getSafeModeText(fsn)%> </b>
<b> <%= NamenodeJspHelper.getRollingUpgradeText(fsn)%> </b>
<b> <%= NamenodeJspHelper.getInodeLimitText(fsn)%> </b>
<%= NamenodeJspHelper.getCorruptFilesWarning(fsn)%>