From 39252995c4d734e993e3fa5338e1a7816aee86fc Mon Sep 17 00:00:00 2001 From: Kihwal Lee Date: Tue, 27 Aug 2013 19:21:15 +0000 Subject: [PATCH] HDFS-3245. Add metrics and web UI for cluster version summary. Contributed by Ravi Prakash. git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1517937 13f79535-47bb-0310-9956-ffa450edef68 --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 3 + .../hadoop/hdfs/protocol/DatanodeInfo.java | 9 +++ .../blockmanagement/DatanodeManager.java | 79 ++++++++++++++++++- .../server/namenode/ClusterJspHelper.java | 3 + .../hdfs/server/namenode/FSNamesystem.java | 17 ++++ .../hdfs/server/namenode/NameNodeMXBean.java | 25 +++++- .../server/namenode/NamenodeJspHelper.java | 23 +++++- .../src/main/webapps/hdfs/dfshealth.jsp | 1 + 8 files changed, 155 insertions(+), 5 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 644699f67f9..bb937935bdd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -333,6 +333,9 @@ Release 2.1.1-beta - UNRELEASED HDFS-5045. Add more unit tests for retry cache to cover all AtMostOnce methods. (jing9) + HDFS-3245. Add metrics and web UI for cluster version summary. (Ravi + Prakash via kihwal) + OPTIMIZATIONS BUG FIXES diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java index 5172bc59f24..3f5715b0afd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/DatanodeInfo.java @@ -47,6 +47,7 @@ public class DatanodeInfo extends DatanodeID implements Node { private long lastUpdate; private int xceiverCount; private String location = NetworkTopology.DEFAULT_RACK; + private String softwareVersion; // Datanode administrative states public enum AdminStates { @@ -383,4 +384,12 @@ public class DatanodeInfo extends DatanodeID implements Node { // by DatanodeID return (this == obj) || super.equals(obj); } + + public String getSoftwareVersion() { + return softwareVersion; + } + + public void setSoftwareVersion(String softwareVersion) { + this.softwareVersion = softwareVersion; + } } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java index 9d5024fb9b1..f9b7d6edbd2 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/blockmanagement/DatanodeManager.java @@ -26,6 +26,7 @@ import java.net.UnknownHostException; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; +import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.NavigableMap; @@ -64,7 +65,6 @@ import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration; import org.apache.hadoop.hdfs.server.protocol.DisallowedDatanodeException; import org.apache.hadoop.hdfs.server.protocol.RegisterCommand; import org.apache.hadoop.hdfs.util.CyclicIteration; -import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.ipc.Server; import org.apache.hadoop.net.CachedDNSToSwitchMapping; import org.apache.hadoop.net.DNSToSwitchMapping; @@ -165,6 +165,14 @@ public class DatanodeManager { * according to the NetworkTopology. */ private boolean hasClusterEverBeenMultiRack = false; + + /** + * The number of datanodes for each software version. This list should change + * during rolling upgrades. + * Software version -> Number of datanodes with this version + */ + private HashMap datanodesSoftwareVersions = + new HashMap(4, 0.75f); DatanodeManager(final BlockManager blockManager, final Namesystem namesystem, final Configuration conf) throws IOException { @@ -456,6 +464,7 @@ public class DatanodeManager { heartbeatManager.removeDatanode(nodeInfo); blockManager.removeBlocksAssociatedTo(nodeInfo); networktopology.remove(nodeInfo); + decrementVersionCount(nodeInfo.getSoftwareVersion()); if (LOG.isDebugEnabled()) { LOG.debug("remove datanode " + nodeInfo); @@ -538,6 +547,61 @@ public class DatanodeManager { } } + private void incrementVersionCount(String version) { + if (version == null) { + return; + } + synchronized(datanodeMap) { + Integer count = this.datanodesSoftwareVersions.get(version); + count = count == null ? 1 : count + 1; + this.datanodesSoftwareVersions.put(version, count); + } + } + + private void decrementVersionCount(String version) { + if (version == null) { + return; + } + synchronized(datanodeMap) { + Integer count = this.datanodesSoftwareVersions.get(version); + if(count != null) { + if(count > 1) { + this.datanodesSoftwareVersions.put(version, count-1); + } else { + this.datanodesSoftwareVersions.remove(version); + } + } + } + } + + private boolean shouldCountVersion(DatanodeDescriptor node) { + return node.getSoftwareVersion() != null && node.isAlive && + !isDatanodeDead(node); + } + + private void countSoftwareVersions() { + synchronized(datanodeMap) { + HashMap versionCount = new HashMap(); + for(DatanodeDescriptor dn: datanodeMap.values()) { + // Check isAlive too because right after removeDatanode(), + // isDatanodeDead() is still true + if(shouldCountVersion(dn)) + { + Integer num = versionCount.get(dn.getSoftwareVersion()); + num = num == null ? 1 : num+1; + versionCount.put(dn.getSoftwareVersion(), num); + } + } + this.datanodesSoftwareVersions = versionCount; + } + } + + public HashMap getDatanodesSoftwareVersions() { + synchronized(datanodeMap) { + return new HashMap (this.datanodesSoftwareVersions); + } + } + /* Resolve a node's network location */ private String resolveNetworkLocation (DatanodeID node) { List names = new ArrayList(1); @@ -755,21 +819,28 @@ public class DatanodeManager { try { // update cluster map getNetworkTopology().remove(nodeS); + if(shouldCountVersion(nodeS)) { + decrementVersionCount(nodeS.getSoftwareVersion()); + } nodeS.updateRegInfo(nodeReg); + + nodeS.setSoftwareVersion(nodeReg.getSoftwareVersion()); nodeS.setDisallowed(false); // Node is in the include list - + // resolve network location nodeS.setNetworkLocation(resolveNetworkLocation(nodeS)); getNetworkTopology().add(nodeS); // also treat the registration message as a heartbeat heartbeatManager.register(nodeS); + incrementVersionCount(nodeS.getSoftwareVersion()); checkDecommissioning(nodeS); success = true; } finally { if (!success) { removeDatanode(nodeS); wipeDatanode(nodeS); + countSoftwareVersions(); } } return; @@ -793,6 +864,7 @@ public class DatanodeManager { try { nodeDescr.setNetworkLocation(resolveNetworkLocation(nodeDescr)); networktopology.add(nodeDescr); + nodeDescr.setSoftwareVersion(nodeReg.getSoftwareVersion()); // register new datanode addDatanode(nodeDescr); @@ -803,10 +875,12 @@ public class DatanodeManager { // because its is done when the descriptor is created heartbeatManager.addDatanode(nodeDescr); success = true; + incrementVersionCount(nodeReg.getSoftwareVersion()); } finally { if (!success) { removeDatanode(nodeDescr); wipeDatanode(nodeDescr); + countSoftwareVersions(); } } } catch (InvalidTopologyException e) { @@ -828,6 +902,7 @@ public class DatanodeManager { namesystem.writeLock(); try { refreshDatanodes(); + countSoftwareVersions(); } finally { namesystem.writeUnlock(); } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java index 0f0a989f8c9..e158d9442d1 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ClusterJspHelper.java @@ -359,6 +359,7 @@ class ClusterJspHelper { nn.httpAddress = httpAddress; getLiveNodeCount(getProperty(props, "LiveNodes").getValueAsText(), nn); getDeadNodeCount(getProperty(props, "DeadNodes").getValueAsText(), nn); + nn.softwareVersion = getProperty(props, "SoftwareVersion").getTextValue(); return nn; } @@ -596,6 +597,7 @@ class ClusterJspHelper { toXmlItemBlockWithLink(doc, nn.deadDatanodeCount + " (" + nn.deadDecomCount + ")", nn.httpAddress+"/dfsnodelist.jsp?whatNodes=DEAD" , "Dead Datanode (Decommissioned)"); + toXmlItemBlock(doc, "Software Version", nn.softwareVersion); doc.endTag(); // node } doc.endTag(); // namenodes @@ -624,6 +626,7 @@ class ClusterJspHelper { int deadDatanodeCount = 0; int deadDecomCount = 0; String httpAddress = null; + String softwareVersion = ""; } /** diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java index 030893028c4..569f2165c7a 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java @@ -6225,6 +6225,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats, innerinfo.put("nonDfsUsedSpace", node.getNonDfsUsed()); innerinfo.put("capacity", node.getCapacity()); innerinfo.put("numBlocks", node.numBlocks()); + innerinfo.put("version", node.getSoftwareVersion()); info.put(node.getHostName(), innerinfo); } return JSON.toString(info); @@ -6436,6 +6437,22 @@ public class FSNamesystem implements Namesystem, FSClusterStats, return JSON.toString(list); } + @Override //NameNodeMXBean + public int getDistinctVersionCount() { + return blockManager.getDatanodeManager().getDatanodesSoftwareVersions() + .size(); + } + + @Override //NameNodeMXBean + public Map getDistinctVersions() { + return blockManager.getDatanodeManager().getDatanodesSoftwareVersions(); + } + + @Override //NameNodeMXBean + public String getSoftwareVersion() { + return VersionInfo.getVersion(); + } + /** * Verifies that the given identifier and password are valid and match. * @param identifier Token identifier. diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java index 173d5aea4c7..ff2e3ea10dd 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeMXBean.java @@ -17,6 +17,8 @@ */ package org.apache.hadoop.hdfs.server.namenode; +import java.util.Map; + import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; @@ -33,7 +35,13 @@ public interface NameNodeMXBean { * @return the version */ public String getVersion(); - + + /** + * Get the version of software running on the Namenode + * @return a string representing the version + */ + public String getSoftwareVersion(); + /** * Gets the used space by data nodes. * @@ -215,4 +223,19 @@ public interface NameNodeMXBean { * @return the list of corrupt files, as a JSON string. */ public String getCorruptFiles(); + + /** + * Get the number of distinct versions of live datanodes + * + * @return the number of distinct versions of live datanodes + */ + public int getDistinctVersionCount(); + + /** + * Get the number of live datanodes for each distinct versions + * + * @return the number of live datanodes for each distinct versions + */ + public Map getDistinctVersions(); + } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java index 1989784ba53..36163c7d0bf 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeJspHelper.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.List; +import java.util.Map; import javax.servlet.ServletContext; import javax.servlet.http.HttpServletRequest; @@ -99,6 +100,20 @@ class NamenodeJspHelper { } } + static String getRollingUpgradeText(FSNamesystem fsn) { + DatanodeManager dm = fsn.getBlockManager().getDatanodeManager(); + Map list = dm.getDatanodesSoftwareVersions(); + if(list.size() > 1) { + StringBuffer status = new StringBuffer("Rolling upgrades in progress. " + + "There are " + list.size() + " versions of datanodes currently live: "); + for(Map.Entry ver: list.entrySet()) { + status.append(ver.getKey() + "(" + ver.getValue() + "), "); + } + return status.substring(0, status.length()-2); + } + return ""; + } + static String getInodeLimitText(FSNamesystem fsn) { if (fsn == null) { return ""; @@ -802,7 +817,9 @@ class NamenodeJspHelper { + "" + percentBpUsed + "" - + d.getVolumeFailures() + "\n"); + + d.getVolumeFailures() + + "" + + d.getSoftwareVersion() + "\n"); } void generateNodesList(ServletContext context, JspWriter out, @@ -900,7 +917,9 @@ class NamenodeJspHelper { + nodeHeaderStr("pcbpused") + "> Block Pool
Used (%)" + " Failed Volumes\n"); + +"> Failed Volumes Version\n"); JspHelper.sortNodeList(live, sorterField, sorterOrder); for (int i = 0; i < live.size(); i++) { diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp index 1201b95ff75..0b0091e10d6 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/webapps/hdfs/dfshealth.jsp @@ -65,6 +65,7 @@

Cluster Summary

<%= NamenodeJspHelper.getSecurityModeText()%> <%= NamenodeJspHelper.getSafeModeText(fsn)%> + <%= NamenodeJspHelper.getRollingUpgradeText(fsn)%> <%= NamenodeJspHelper.getInodeLimitText(fsn)%> <%= NamenodeJspHelper.getCorruptFilesWarning(fsn)%>