HDDS-38. Add SCMNodeStorage map in SCM class to store storage statistics per Datanode.

Contributed by Shashikant Banerjee.
This commit is contained in:
Anu Engineer 2018-05-17 16:13:28 -07:00
parent e0367d3b24
commit 7c485a6701
6 changed files with 584 additions and 0 deletions

View File

@ -244,6 +244,16 @@ public final class OzoneConfigKeys {
public static final String HDDS_DATANODE_PLUGINS_KEY = public static final String HDDS_DATANODE_PLUGINS_KEY =
"hdds.datanode.plugins"; "hdds.datanode.plugins";
/**
 * Config key for the storage utilization ratio at or above which a
 * datanode is considered low on space.
 */
public static final String
    HDDS_DATANODE_STORAGE_UTILIZATION_WARNING_THRESHOLD =
    "hdds.datanode.storage.utilization.warning.threshold";
/**
 * Default warning threshold. Must stay below the critical default and in
 * sync with the value shipped in ozone-default.xml (0.75).
 */
public static final double
    HDDS_DATANODE_STORAGE_UTILIZATION_WARNING_THRESHOLD_DEFAULT = 0.75;
/**
 * Config key for the storage utilization ratio at or above which a
 * datanode is considered out of space.
 */
public static final String
    HDDS_DATANODE_STORAGE_UTILIZATION_CRITICAL_THRESHOLD =
    "hdds.datanode.storage.utilization.critical.threshold";
/**
 * Default critical threshold. Must stay above the warning default and in
 * sync with the value shipped in ozone-default.xml (0.95).
 */
public static final double
    HDDS_DATANODE_STORAGE_UTILIZATION_CRITICAL_THRESHOLD_DEFAULT = 0.95;
/** /**
* There is no need to instantiate this class. * There is no need to instantiate this class.
*/ */

View File

@ -1057,5 +1057,24 @@
HDDS service starts as part of datanode. HDDS service starts as part of datanode.
</description> </description>
</property> </property>
<property>
<name>hdds.datanode.storage.utilization.warning.threshold</name>
<value>0.75</value>
<tag>OZONE, SCM, MANAGEMENT</tag>
<description>
If a datanode's overall storage utilization reaches or exceeds this
value, a warning will be logged while processing the nodeReport in SCM.
</description>
</property>
<property>
<name>hdds.datanode.storage.utilization.critical.threshold</name>
<value>0.95</value>
<tag>OZONE, SCM, MANAGEMENT</tag>
<description>
If a datanode's overall storage utilization reaches or exceeds this
value, the datanode will be marked out of space.
</description>
</property>
</configuration> </configuration>

View File

@ -136,4 +136,25 @@ public class SCMNodeStat implements NodeStat {
public int hashCode() { public int hashCode() {
return Long.hashCode(capacity.get() ^ scmUsed.get() ^ remaining.get()); return Long.hashCode(capacity.get() ^ scmUsed.get() ^ remaining.get());
} }
/**
 * Cuts a value down to four decimal places by discarding (not rounding)
 * the remaining digits, because unbounded precision is sometimes
 * counter-intuitive to what users expect.
 *
 * @param value the number to truncate.
 * @return the number with at most four decimal digits.
 */
private double truncateDecimals(double value) {
  final int scale = 10000;
  // The cast to long drops everything past the fourth decimal digit.
  final long scaled = (long) (value * scale);
  return (double) scaled / scale;
}
/**
 * Computes the fraction of this node's capacity that is used by SCM.
 *
 * @return scmUsed divided by capacity, truncated to four decimal places.
 */
public double getScmUsedratio() {
  return truncateDecimals(getScmUsed().get() / (double) getCapacity().get());
}
} }

View File

@ -0,0 +1,69 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdds.scm.node;
import org.apache.hadoop.classification.InterfaceAudience;
import java.util.UUID;
/**
 * JMX management interface for datanode storage statistics: per-datanode
 * capacity, used space and remaining space, plus the totals across all
 * datanodes.
 */
@InterfaceAudience.Private
public interface SCMNodeStorageStatMXBean {
/**
 * Returns the capacity of a Datanode.
 *
 * @param datanodeID Datanode Id
 * @return capacity of the given Datanode
 */
long getCapacity(UUID datanodeID);
/**
 * Returns the remaining space of a Datanode.
 *
 * @param datanodeId Datanode Id
 * @return remaining space of the given Datanode
 */
long getRemainingSpace(UUID datanodeId);
/**
 * Returns used space in bytes of a Datanode.
 *
 * @param datanodeId Datanode Id
 * @return used space of the given Datanode
 */
long getUsedSpace(UUID datanodeId);
/**
 * Returns the total capacity of all Datanodes.
 *
 * @return sum of the capacity of all Datanodes
 */
long getTotalCapacity();
/**
 * Returns the total used space in all Datanodes.
 *
 * @return sum of the used space of all Datanodes
 */
long getTotalSpaceUsed();
/**
 * Returns the total remaining space in all Datanodes.
 *
 * @return sum of the remaining space of all Datanodes
 */
long getTotalFreeSpace();
}

View File

@ -0,0 +1,277 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with this
* work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*
*/
package org.apache.hadoop.hdds.scm.node;
import com.google.common.base.Preconditions;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.ozone.OzoneConfigKeys;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.management.ObjectName;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.stream.Collectors;
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.DUPLICATE_DATANODE;
import static org.apache.hadoop.hdds.scm.exceptions.SCMException.ResultCodes.NO_SUCH_DATANODE;
/**
 * Maintains the disk capacity, SCM-used space and remaining space reported
 * for each Datanode, together with a cluster-wide aggregate of those values.
 * The figures are built from the storage reports in Datanode node reports.
 *
 * <p>Every mutation of the per-datanode map and of the aggregate is done
 * while holding the monitor of the map, so both are modified together.
 */
public class SCMNodeStorageStatMap implements SCMNodeStorageStatMXBean {
  static final Logger LOG =
      LoggerFactory.getLogger(SCMNodeStorageStatMap.class);

  private final double warningUtilizationThreshold;
  private final double criticalUtilizationThreshold;

  private final Map<UUID, SCMNodeStat> scmNodeStorageStatMap;
  // NodeStorageInfo MXBean
  private ObjectName scmNodeStorageInfoBean;
  // Aggregated node stats
  private final SCMNodeStat clusterStat;

  /**
   * Constructs an empty map and reads the warning and critical utilization
   * thresholds from the configuration.
   *
   * @param conf configuration supplying the utilization thresholds.
   */
  public SCMNodeStorageStatMap(OzoneConfiguration conf) {
    scmNodeStorageStatMap = new ConcurrentHashMap<>();
    warningUtilizationThreshold = conf.getDouble(
        OzoneConfigKeys.
            HDDS_DATANODE_STORAGE_UTILIZATION_WARNING_THRESHOLD,
        OzoneConfigKeys.
            HDDS_DATANODE_STORAGE_UTILIZATION_WARNING_THRESHOLD_DEFAULT);
    criticalUtilizationThreshold = conf.getDouble(
        OzoneConfigKeys.
            HDDS_DATANODE_STORAGE_UTILIZATION_CRITICAL_THRESHOLD,
        OzoneConfigKeys.
            HDDS_DATANODE_STORAGE_UTILIZATION_CRITICAL_THRESHOLD_DEFAULT);
    clusterStat = new SCMNodeStat();
  }

  /**
   * Utilization bands a Datanode can fall into.
   * NORMAL: used ratio below the warning threshold.
   * WARN: used ratio at or above the warning threshold but below the
   * critical threshold.
   * CRITICAL: used ratio at or above the critical threshold.
   */
  public enum UtilizationThreshold {
    NORMAL, WARN, CRITICAL;
  }

  /**
   * Returns true if this a datanode that is already tracked by
   * scmNodeStorageStatMap.
   *
   * @param datanodeID - UUID of the Datanode.
   * @return True if this is tracked, false if this map does not know about it.
   */
  public boolean isKnownDatanode(UUID datanodeID) {
    Preconditions.checkNotNull(datanodeID);
    return scmNodeStorageStatMap.containsKey(datanodeID);
  }

  /**
   * Lists the Datanodes whose current used ratio falls into the given
   * utilization band.
   *
   * @param threshold - utilization band to filter by.
   * @return UUIDs of the matching Datanodes; empty if none match.
   */
  public List<UUID> getDatanodeList(
      UtilizationThreshold threshold) {
    return scmNodeStorageStatMap.entrySet().stream()
        .filter(entry -> isThresholdReached(threshold, entry.getValue()))
        .map(Map.Entry::getKey)
        .collect(Collectors.toList());
  }

  /**
   * Inserts a new datanode into the storage stat map and adds its figures
   * to the cluster aggregate.
   *
   * @param datanodeID -- Datanode UUID
   * @param stat - scmNode stat for the Datanode.
   * @throws SCMException - if the datanode is already tracked.
   */
  public void insertNewDatanode(UUID datanodeID, SCMNodeStat stat)
      throws SCMException {
    Preconditions.checkNotNull(stat);
    Preconditions.checkNotNull(datanodeID);
    synchronized (scmNodeStorageStatMap) {
      if (isKnownDatanode(datanodeID)) {
        throw new SCMException("Node already exists in the map",
            DUPLICATE_DATANODE);
      }
      scmNodeStorageStatMap.put(datanodeID, stat);
      clusterStat.add(stat);
    }
  }

  //TODO: This should be called once SCMNodeManager gets Started.
  private void registerMXBean() {
    this.scmNodeStorageInfoBean = MBeans.register("StorageContainerManager",
        "scmNodeStorageInfo", this);
  }

  //TODO: Unregister call should happen as a part of SCMNodeManager shutdown.
  private void unregisterMXBean() {
    if (this.scmNodeStorageInfoBean != null) {
      MBeans.unregister(this.scmNodeStorageInfoBean);
      this.scmNodeStorageInfoBean = null;
    }
  }

  /**
   * Replaces the storage stat of an existing DN and adjusts the cluster
   * aggregate by the difference.
   *
   * <p>The passed {@code stat} must not be the instance that is currently
   * stored for the datanode after it was modified in place: the old figures
   * are read from the stored instance, so the aggregate would not change.
   *
   * @param datanodeID - UUID of DN.
   * @param stat - scmNode stat for the Datanode.
   * @throws SCMException - if we don't know about this datanode, for new DN
   *                      use insertNewDatanode.
   */
  public void updateDatanodeMap(UUID datanodeID, SCMNodeStat stat)
      throws SCMException {
    Preconditions.checkNotNull(datanodeID);
    Preconditions.checkNotNull(stat);
    synchronized (scmNodeStorageStatMap) {
      if (!scmNodeStorageStatMap.containsKey(datanodeID)) {
        throw new SCMException("No such datanode", NO_SUCH_DATANODE);
      }
      SCMNodeStat removed = scmNodeStorageStatMap.get(datanodeID);
      clusterStat.subtract(removed);
      scmNodeStorageStatMap.put(datanodeID, stat);
      clusterStat.add(stat);
    }
  }

  /**
   * Sums the storage reports of a node report, records the result for the
   * datanode (inserting it if it is not tracked yet) and classifies the
   * datanode against the utilization thresholds.
   *
   * @param datanodeID - UUID of DN.
   * @param nodeReport - node report received from the DN.
   * @return DATANODE_OUT_OF_SPACE if the critical threshold is reached,
   * ALL_IS_WELL otherwise.
   * @throws SCMException - propagated from the insert or update of the map.
   */
  public NodeReportStatus processNodeReport(UUID datanodeID,
      StorageContainerDatanodeProtocolProtos.SCMNodeReport nodeReport)
      throws SCMException {
    Preconditions.checkNotNull(datanodeID);
    Preconditions.checkNotNull(nodeReport);

    long totalCapacity = 0;
    long totalRemaining = 0;
    long totalScmUsed = 0;
    List<StorageContainerDatanodeProtocolProtos.SCMStorageReport>
        storageReports = nodeReport.getStorageReportList();
    for (StorageContainerDatanodeProtocolProtos.SCMStorageReport report
        : storageReports) {
      totalCapacity += report.getCapacity();
      totalRemaining += report.getRemaining();
      totalScmUsed += report.getScmUsed();
    }

    // Always record the report in a fresh object. Mutating the stored
    // instance in place would make updateDatanodeMap subtract and add the
    // same (already updated) figures, leaving the cluster aggregate stale.
    SCMNodeStat stat = new SCMNodeStat();
    stat.set(totalCapacity, totalScmUsed, totalRemaining);

    // Decide between insert and update under the same lock that guards the
    // map, so two concurrent first reports cannot both try to insert.
    synchronized (scmNodeStorageStatMap) {
      if (scmNodeStorageStatMap.containsKey(datanodeID)) {
        updateDatanodeMap(datanodeID, stat);
      } else {
        insertNewDatanode(datanodeID, stat);
      }
    }

    if (isThresholdReached(UtilizationThreshold.CRITICAL, stat)) {
      LOG.warn("Datanode {} is out of storage space. Capacity: {}, Used: {}",
          datanodeID, stat.getCapacity().get(), stat.getScmUsed().get());
      return NodeReportStatus.DATANODE_OUT_OF_SPACE;
    }
    if (isThresholdReached(UtilizationThreshold.WARN, stat)) {
      LOG.warn("Datanode {} is low on storage space. Capacity: {}, Used: {}",
          datanodeID, stat.getCapacity().get(), stat.getScmUsed().get());
    }
    return NodeReportStatus.ALL_IS_WELL;
  }

  /**
   * Tells whether the used ratio of the given stat lies in the given
   * utilization band.
   */
  private boolean isThresholdReached(UtilizationThreshold threshold,
      SCMNodeStat stat) {
    // Read the ratio once so every comparison sees the same value.
    final double usedRatio = stat.getScmUsedratio();
    switch (threshold) {
    case NORMAL:
      return usedRatio < warningUtilizationThreshold;
    case WARN:
      return usedRatio >= warningUtilizationThreshold
          && usedRatio < criticalUtilizationThreshold;
    case CRITICAL:
      return usedRatio >= criticalUtilizationThreshold;
    default:
      throw new RuntimeException("Unknown UtilizationThreshold value");
    }
  }

  /**
   * Looks up the stat of a datanode that is expected to be tracked.
   *
   * @throws NullPointerException if the id is null or the datanode is not
   * tracked; the message names the offending datanode.
   */
  private SCMNodeStat getKnownNodeStat(UUID datanodeID) {
    Preconditions.checkNotNull(datanodeID, "datanodeID must not be null");
    return Preconditions.checkNotNull(scmNodeStorageStatMap.get(datanodeID),
        "No storage statistics tracked for datanode %s", datanodeID);
  }

  @Override
  public long getCapacity(UUID dnId) {
    return getKnownNodeStat(dnId).getCapacity().get();
  }

  @Override
  public long getRemainingSpace(UUID dnId) {
    return getKnownNodeStat(dnId).getRemaining().get();
  }

  @Override
  public long getUsedSpace(UUID dnId) {
    return getKnownNodeStat(dnId).getScmUsed().get();
  }

  @Override
  public long getTotalCapacity() {
    return clusterStat.getCapacity().get();
  }

  @Override
  public long getTotalSpaceUsed() {
    return clusterStat.getScmUsed().get();
  }

  @Override
  public long getTotalFreeSpace() {
    return clusterStat.getRemaining().get();
  }

  /**
   * Removes the dataNode from scmNodeStorageStatMap and subtracts its
   * figures from the cluster aggregate.
   *
   * @param datanodeID - UUID of DN.
   * @throws SCMException in case the dataNode is not found in the map.
   */
  public void removeDatanode(UUID datanodeID) throws SCMException {
    Preconditions.checkNotNull(datanodeID);
    synchronized (scmNodeStorageStatMap) {
      if (!scmNodeStorageStatMap.containsKey(datanodeID)) {
        throw new SCMException("No such datanode", NO_SUCH_DATANODE);
      }
      SCMNodeStat stat = scmNodeStorageStatMap.remove(datanodeID);
      clusterStat.subtract(stat);
    }
  }

  /**
   * Gets the SCMNodeStat for the datanode.
   *
   * @param datanodeID - UUID of DN.
   * @return SCMNodeStat, or null if the datanode is not tracked.
   */
  SCMNodeStat getNodeStat(UUID datanodeID) {
    return scmNodeStorageStatMap.get(datanodeID);
  }

  /**
   * Results possible from processing a Node report by
   * SCMNodeStorageStatMap.
   */
  public enum NodeReportStatus {
    ALL_IS_WELL,
    DATANODE_OUT_OF_SPACE
  }
}

View File

@ -0,0 +1,188 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdds.scm.node;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.scm.container.placement.metrics.SCMNodeStat;
import org.apache.hadoop.hdds.scm.exceptions.SCMException;
import org.apache.hadoop.hdds.protocol.proto
.StorageContainerDatanodeProtocolProtos.SCMNodeReport;
import org.apache.hadoop.hdds.protocol.proto
.StorageContainerDatanodeProtocolProtos.SCMStorageReport;
import org.apache.hadoop.ozone.OzoneConsts;
import org.junit.*;
import org.junit.rules.ExpectedException;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Unit tests for {@link SCMNodeStorageStatMap}.
 */
public class TestSCMNodeStorageStatMap {
  private final static int DATANODE_COUNT = 300;
  final long capacity = 10L * OzoneConsts.GB;
  final long used = 2L * OzoneConsts.GB;
  final long remaining = capacity - used;
  private static OzoneConfiguration conf = new OzoneConfiguration();
  private final Map<UUID, SCMNodeStat> testData = new ConcurrentHashMap<>();

  @Rule
  public ExpectedException thrown = ExpectedException.none();

  /**
   * Fills testData with DATANODE_COUNT datanodes. Each datanode gets its own
   * stat instance so that changing one node can never leak into another.
   */
  private void generateData() {
    for (int dnIndex = 1; dnIndex <= DATANODE_COUNT; dnIndex++) {
      SCMNodeStat stat = new SCMNodeStat();
      stat.set(capacity, used, remaining);
      testData.put(UUID.randomUUID(), stat);
    }
  }

  private UUID getFirstKey() {
    return testData.keySet().iterator().next();
  }

  @Before
  public void setUp() throws Exception {
    generateData();
  }

  @Test
  public void testIsKnownDatanode() throws SCMException {
    SCMNodeStorageStatMap map = new SCMNodeStorageStatMap(conf);
    UUID knownNode = getFirstKey();
    UUID unknownNode = UUID.randomUUID();
    SCMNodeStat stat = testData.get(knownNode);
    map.insertNewDatanode(knownNode, stat);
    Assert.assertTrue("Not able to detect a known node",
        map.isKnownDatanode(knownNode));
    Assert.assertFalse("Unknown node detected",
        map.isKnownDatanode(unknownNode));
  }

  @Test
  public void testInsertNewDatanode() throws SCMException {
    SCMNodeStorageStatMap map = new SCMNodeStorageStatMap(conf);
    UUID knownNode = getFirstKey();
    SCMNodeStat stat = testData.get(knownNode);
    map.insertNewDatanode(knownNode, stat);
    // JUnit expects (expected, actual).
    Assert.assertEquals(testData.get(knownNode).getScmUsed(),
        map.getNodeStat(knownNode).getScmUsed());
    thrown.expect(SCMException.class);
    thrown.expectMessage("already exists");
    map.insertNewDatanode(knownNode, stat);
  }

  @Test
  public void testUpdateUnknownDatanode() throws SCMException {
    SCMNodeStorageStatMap map = new SCMNodeStorageStatMap(conf);
    UUID unknownNode = UUID.randomUUID();
    SCMNodeStat stat = new SCMNodeStat();

    thrown.expect(SCMException.class);
    thrown.expectMessage("No such datanode");
    map.updateDatanodeMap(unknownNode, stat);
  }

  @Test
  public void testProcessNodeReportCheckOneNode() throws SCMException {
    UUID key = getFirstKey();
    SCMNodeStat value = testData.get(key);
    SCMNodeStorageStatMap map = new SCMNodeStorageStatMap(conf);
    map.insertNewDatanode(key, value);
    Assert.assertTrue(map.isKnownDatanode(key));

    SCMNodeReport.Builder nrb = SCMNodeReport.newBuilder();
    SCMStorageReport.Builder srb = SCMStorageReport.newBuilder()
        .setStorageUuid(UUID.randomUUID().toString())
        .setCapacity(value.getCapacity().get())
        .setScmUsed(value.getScmUsed().get())
        .setRemaining(value.getRemaining().get());
    SCMNodeStorageStatMap.NodeReportStatus status =
        map.processNodeReport(key, nrb.addStorageReport(srb).build());
    Assert.assertEquals(SCMNodeStorageStatMap.NodeReportStatus.ALL_IS_WELL,
        status);

    // A report repeating the known figures must not change the totals.
    Assert.assertEquals(capacity, map.getTotalCapacity());
    Assert.assertEquals(used, map.getTotalSpaceUsed());
    Assert.assertEquals(remaining, map.getTotalFreeSpace());
  }

  @Test
  public void testProcessNodeReportAndSCMStats() throws SCMException {
    SCMNodeStorageStatMap map = new SCMNodeStorageStatMap(conf);
    final int fullNodes = DATANODE_COUNT / 4;
    final int normalNodes = DATANODE_COUNT - fullNodes;
    int counter = 1;

    // Insert all testData into the SCMNodeStorageStatMap Map.
    for (Map.Entry<UUID, SCMNodeStat> keyEntry : testData.entrySet()) {
      map.insertNewDatanode(keyEntry.getKey(), keyEntry.getValue());
    }
    Assert.assertEquals(DATANODE_COUNT * capacity, map.getTotalCapacity());
    Assert.assertEquals(DATANODE_COUNT * remaining, map.getTotalFreeSpace());
    Assert.assertEquals(DATANODE_COUNT * used, map.getTotalSpaceUsed());

    // update 1/4th of the datanode to be full
    for (Map.Entry<UUID, SCMNodeStat> keyEntry : testData.entrySet()) {
      SCMNodeStat stat = new SCMNodeStat(capacity, capacity, 0);
      map.updateDatanodeMap(keyEntry.getKey(), stat);
      counter++;
      if (counter > fullNodes) {
        break;
      }
    }
    Assert.assertEquals(fullNodes,
        map.getDatanodeList(SCMNodeStorageStatMap.UtilizationThreshold.CRITICAL)
            .size());
    Assert.assertEquals(0,
        map.getDatanodeList(SCMNodeStorageStatMap.UtilizationThreshold.WARN)
            .size());
    Assert.assertEquals(normalNodes,
        map.getDatanodeList(SCMNodeStorageStatMap.UtilizationThreshold.NORMAL)
            .size());

    Assert.assertEquals(DATANODE_COUNT * capacity, map.getTotalCapacity());
    Assert.assertEquals(normalNodes * remaining, map.getTotalFreeSpace());
    Assert.assertEquals(normalNodes * used + fullNodes * capacity,
        map.getTotalSpaceUsed());

    counter = 1;
    // Remove 1/4 of the DataNodes from the Map. testData is iterated in the
    // same order as above, so these are the nodes that were made full.
    for (Map.Entry<UUID, SCMNodeStat> keyEntry : testData.entrySet()) {
      map.removeDatanode(keyEntry.getKey());
      counter++;
      if (counter > fullNodes) {
        break;
      }
    }

    Assert.assertEquals(0,
        map.getDatanodeList(SCMNodeStorageStatMap.UtilizationThreshold.CRITICAL)
            .size());
    Assert.assertEquals(0,
        map.getDatanodeList(SCMNodeStorageStatMap.UtilizationThreshold.WARN)
            .size());
    Assert.assertEquals(normalNodes,
        map.getDatanodeList(SCMNodeStorageStatMap.UtilizationThreshold.NORMAL)
            .size());

    Assert.assertEquals(normalNodes * capacity, map.getTotalCapacity());
    Assert.assertEquals(normalNodes * remaining, map.getTotalFreeSpace());
    Assert.assertEquals(normalNodes * used, map.getTotalSpaceUsed());
  }
}